[llvm] 4a331be - [AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi

Tue Jan 28 02:52:44 PST 2020

Author: Jay Foad
Date: 2020-01-28T10:52:17Z
New Revision: 4a331beadc3aaeb24a88853d2703f4ac7d513df1

URL: https://github.com/llvm/llvm-project/commit/4a331beadc3aaeb24a88853d2703f4ac7d513df1
DIFF: https://github.com/llvm/llvm-project/commit/4a331beadc3aaeb24a88853d2703f4ac7d513df1.diff

LOG: [AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi

Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69661

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 61db7ba37e04..99cb4e9e5d4a 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -587,6 +587,11 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
     return getGeneration() <= SEA_ISLANDS;
   }
 
+  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
+  bool partialVCCWritesUpdateVCCZ() const {
+    return getGeneration() >= GFX10;
+  }
+
   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
   /// was written by a VALU instruction.
   bool hasSMRDReadVALUDefHazard() const {

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ef662d55cb0a..5cf4909bd9b7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1383,6 +1383,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     ScoreBrackets.dump();
   });
 
+  // Assume VCCZ is correct at basic block boundaries, unless and until we need
+  // to handle cases where that is not true.
+  bool VCCZCorrect = true;
+
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
 
@@ -1402,13 +1406,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       continue;
     }
 
-    bool VCCZBugWorkAround = false;
+    // We might need to restore vccz to its correct value for either of two
+    // different reasons; see ST->hasReadVCCZBug() and
+    // ST->partialVCCWritesUpdateVCCZ().
+    bool RestoreVCCZ = false;
     if (readsVCCZ(Inst)) {
-      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
-              ScoreBrackets.getScoreUB(LGKM_CNT) &&
-          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-        if (ST->hasReadVCCZBug())
-          VCCZBugWorkAround = true;
+      if (!VCCZCorrect)
+        RestoreVCCZ = true;
+      else if (ST->hasReadVCCZBug()) {
+        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
+        // vccz bit, so when we detect that an instruction may read from a
+        // corrupt vccz bit, we need to:
+        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+        //    operations to complete.
+        // 2. Restore the correct value of vccz by writing the current value
+        //    of vcc back to vcc.
+        if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+            ScoreBrackets.getScoreUB(LGKM_CNT) &&
+            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+          RestoreVCCZ = true;
+        }
       }
     }
 
@@ -1419,6 +1436,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       }
     }
 
+    if (!ST->partialVCCWritesUpdateVCCZ()) {
+      // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+      // Writes to vcc will fix it.
+      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+          Inst.definesRegister(AMDGPU::VCC_HI))
+        VCCZCorrect = false;
+      else if (Inst.definesRegister(AMDGPU::VCC))
+        VCCZCorrect = true;
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,7 +1471,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     // TODO: Remove this work-around after fixing the scheduler and enable the
     // assert above.
-    if (VCCZBugWorkAround) {
+    if (RestoreVCCZ) {
       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
       // bit is updated, so we can restore the bit by reading the value of
       // vcc and then writing it back to the register.
@@ -1452,6 +1479,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
               TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
               TRI->getVCC())
           .addReg(TRI->getVCC());
+      VCCZCorrect = true;
       Modified = true;
     }
 

diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index d3262abad36a..686ff6f3b0c9 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -85,3 +85,81 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+# Test that after reloading vcc spilled to a vgpr, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_vgpr
+# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+# SI:    $vcc = S_MOV_B64 $vcc
+# GFX9:  $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_vgpr
+body: |
+  bb.0:
+    $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+    $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after reloading vcc spilled to memory, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_mem
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# SI:    $vcc = S_MOV_B64 $vcc
+# GFX9:  $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_mem
+body: |
+  bb.0:
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+    $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+    $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc_lo, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc_lo
+# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+# SI:    $vcc = S_MOV_B64 $vcc
+# GFX9:  $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc_lo
+body: |
+  bb.0:
+    INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc, no unnecessary instructions are
+# inserted to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc
+# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc
+body: |
+  bb.0:
+    INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...