[llvm] 27a8afa - AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (#117287)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 25 09:33:07 PST 2024


Author: Matt Arsenault
Date: 2024-11-25T09:33:04-08:00
New Revision: 27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6

URL: https://github.com/llvm/llvm-project/commit/27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6
DIFF: https://github.com/llvm/llvm-project/commit/27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6.diff

LOG: AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (#117287)

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 97995560842090..4c37ef8855a5ba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2551,8 +2551,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
     return isVCmpXWritesExec(*TII, *TRI, MI);
   };
 
-  const int NumWaitStates = 4;
-  return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
+  auto IsVALUFn = [](const MachineInstr &MI) {
+    return SIInstrInfo::isVALU(MI);
+  };
+
+  const int VCmpXWritesExecWaitStates = 4;
+  const int VALUWritesVDstWaitStates = 2;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+    Register Reg = Op.getReg();
+
+    int WaitStatesSinceDef =
+        VALUWritesVDstWaitStates -
+        getWaitStatesSinceDef(Reg, IsVALUFn,
+                              /*MaxWaitStates=*/VALUWritesVDstWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
+    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
+      break;
+  }
+
+  int VCmpXHazardWaits =
+      VCmpXWritesExecWaitStates -
+      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
+
+  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
+  return WaitStatesNeeded;
 }
 
 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {

diff  --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 97bef7be711ff2..75834316750951 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -142,3 +142,116 @@ body:             |
     $vgpr4 = V_MOV_B32_e32 0, implicit $exec
     renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
 ...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# No hazard: the VALU writes $vgpr2, which the permlane does not read.
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0_otherreg
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# Both permlane hazards at once: VALU write of a read VGPR and V_CMPX write of exec.
+# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+# GCN:      V_MOV_B32
+# GCN:      V_CMPX_EQ_I32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_PERMLANE
+name:            valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 0d5dfa46c2c260..e1cebe28f7fe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
index e3b0879af4307d..121c379053fcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)


        


More information about the llvm-commits mailing list