[llvm] 7ecf196 - [AMDGPU] Fix and extend vccz workarounds

Wed Nov 18 07:28:35 PST 2020

Author: Jay Foad
Date: 2020-11-18T15:26:06Z
New Revision: 7ecf19697ee21a23fc8782daefcb268085874d7f

URL: https://github.com/llvm/llvm-project/commit/7ecf19697ee21a23fc8782daefcb268085874d7f
DIFF: https://github.com/llvm/llvm-project/commit/7ecf19697ee21a23fc8782daefcb268085874d7f.diff

LOG: [AMDGPU] Fix and extend vccz workarounds

We have workarounds for two different cases where vccz can get out of
sync with the value in vcc. This fixes them in two ways:

1. Fix the case where the def of vcc was in a previous basic block, by
pessimistically assuming that vccz might be incorrect at a basic block
boundary.

2. Fix the handling of pre-existing waitcnt instructions by calling
generateWaitcntInstBefore before examining ScoreBrackets to determine
whether there's an outstanding smem read operation.

Differential Revision: https://reviews.llvm.org/D91636

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/infinite-loop.ll
    llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
    llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8fbfab9f2d77..c80263df4cb3 100644

--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1428,9 +1428,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     ScoreBrackets.dump();
   });
 
-  // Assume VCCZ is correct at basic block boundaries, unless and until we need
-  // to handle cases where that is not true.
+  // Track the correctness of vccz through this basic block. There are two
+  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
+  // ST->partialVCCWritesUpdateVCCZ().
   bool VCCZCorrect = true;
+  if (ST->hasReadVCCZBug()) {
+    // vccz could be incorrect at a basic block boundary if a predecessor wrote
+    // to vcc and then issued an smem load.
+    VCCZCorrect = false;
+  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
+    // vccz could be incorrect at a basic block boundary if a predecessor wrote
+    // to vcc_lo or vcc_hi.
+    VCCZCorrect = false;
+  }
 
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
@@ -1451,15 +1461,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       continue;
     }
 
-    // We might need to restore vccz to its correct value for either of two
-    // 
diff erent reasons; see ST->hasReadVCCZBug() and
-    // ST->partialVCCWritesUpdateVCCZ().
-    bool RestoreVCCZ = false;
-    if (readsVCCZ(Inst)) {
-      if (!VCCZCorrect) {
-        // Restore vccz if it's not known to be correct already.
-        RestoreVCCZ = true;
-      } else if (ST->hasReadVCCZBug()) {
+    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    OldWaitcntInstr = nullptr;
+
+    // Restore vccz if it's not known to be correct already.
+    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+
+    // Don't examine operands unless we need to track vccz correctness.
+    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
+      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+          Inst.definesRegister(AMDGPU::VCC_HI)) {
+        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+        if (!ST->partialVCCWritesUpdateVCCZ())
+          VCCZCorrect = false;
+      } else if (Inst.definesRegister(AMDGPU::VCC)) {
         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
         // vccz bit, so when we detect that an instruction may read from a
         // corrupt vccz bit, we need to:
@@ -1467,12 +1483,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         //    operations to complete.
         // 2. Restore the correct value of vccz by writing the current value
         //    of vcc back to vcc.
-        if (ScoreBrackets.getScoreLB(LGKM_CNT) <
-            ScoreBrackets.getScoreUB(LGKM_CNT) &&
+        if (ST->hasReadVCCZBug() &&
+            ScoreBrackets.getScoreLB(LGKM_CNT) <
+                ScoreBrackets.getScoreUB(LGKM_CNT) &&
             ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-          // Restore vccz if there's an outstanding smem read, which could
-          // complete and clobber vccz at any time.
-          RestoreVCCZ = true;
+          // Writes to vcc while there's an outstanding smem read may get
+          // clobbered as soon as any read completes.
+          VCCZCorrect = false;
+        } else {
+          // Writes to vcc will fix any incorrect value in vccz.
+          VCCZCorrect = true;
         }
       }
     }
@@ -1482,24 +1502,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
         const Value *Ptr = Memop->getValue();
         SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
       }
-    }
-
-    if (!ST->partialVCCWritesUpdateVCCZ()) {
-      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
-          Inst.definesRegister(AMDGPU::VCC_HI)) {
-        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+      if (ST->hasReadVCCZBug()) {
+        // This smem read could complete and clobber vccz at any time.
         VCCZCorrect = false;
-      } else if (Inst.definesRegister(AMDGPU::VCC)) {
-        // Writes to vcc will fix any incorrect value in vccz.
-        VCCZCorrect = true;
       }
     }
 
-    // Generate an s_waitcnt instruction to be placed before
-    // cur_Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
-    OldWaitcntInstr = nullptr;
-
     updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.

diff  --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index d82d90564aa4..b488c48903ce 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -45,6 +45,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccnz BB1_2
 ; SI-NEXT:  BB1_3: ; %UnifiedReturnBlock
 ; SI-NEXT:    s_endpgm
@@ -86,6 +87,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccnz BB2_2
 ; SI-NEXT:  ; %bb.3: ; %Flow
 ; SI-NEXT:    s_mov_b64 s[2:3], 0
@@ -103,6 +105,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
 ; SI-NEXT:  BB2_6: ; %loop1
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccz BB2_6
 ; SI-NEXT:  BB2_7: ; %DummyReturnBlock
 ; SI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 8bdc05bafacd..03a49f199350 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -42,6 +42,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; GCN-NEXT:    s_and_b64 vcc, exec, 0
 ; GCN-NEXT:  BB0_6: ; %bb9
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-NEXT:    s_cbranch_vccz BB0_6
 ; GCN-NEXT:  BB0_7: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 06f09e8e4d07..8f1e2489e6fd 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -227,6 +227,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
 ; SI-NEXT:  BB3_12: ; %self.loop
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccz BB3_12
 ; SI-NEXT:  BB3_13: ; %DummyReturnBlock
 ; SI-NEXT:    s_endpgm
@@ -298,6 +299,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; FLAT-NEXT:    s_and_b64 vcc, exec, 0
 ; FLAT-NEXT:  BB3_12: ; %self.loop
 ; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
+; FLAT-NEXT:    s_mov_b64 vcc, vcc
 ; FLAT-NEXT:    s_cbranch_vccz BB3_12
 ; FLAT-NEXT:  BB3_13: ; %DummyReturnBlock
 ; FLAT-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index d1dac8c8c782..6bf4fcef66a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -15,6 +15,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
 ; CHECK-NEXT:  BB0_2: ; %bb10
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz BB0_2
 ; CHECK-NEXT:  BB0_3: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 79233141070d..2d93f0ec00ce 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -168,6 +168,8 @@ body: |
 
 # CHECK-LABEL: name: vcc_def_pred
 # CHECK: bb.1:
+# SI:  $vcc = S_MOV_B64 $vcc
+# GFX9:  $vcc = S_MOV_B64 $vcc
 # CHECK: S_CBRANCH_VCCZ %bb.2, implicit $vcc
 
 name: vcc_def_pred
@@ -243,7 +245,7 @@ body: |
 # SI-NEXT: $vcc = S_MOV_B64 0
 # SI-NEXT: S_WAITCNT 127
 # SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
 # SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
 name: load_def_wait_nop_use
 body: |
@@ -298,7 +300,7 @@ body: |
 # SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
 # SI-NEXT: S_WAITCNT 127
 # SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
 # SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
 name: def_load_wait_nop_use
 body: |