[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (PR #117287)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Nov 22 12:18:23 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117287

>From 73457dc7df855747568559c9b9c626f6efef01d1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 18 Mar 2024 14:41:11 +0530
Subject: [PATCH] AMDGPU: Handle gfx950 valu write vdst + permlane read hazard

---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  30 ++++-
 llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir   | 113 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.permlane16.swap.ll     |   6 +
 .../AMDGPU/llvm.amdgcn.permlane32.swap.ll     |   6 +
 4 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 45ff1f4a63cf03..19651f2557b46d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2552,8 +2552,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
     return isVCmpXWritesExec(*TII, *TRI, MI);
   };
 
-  const int NumWaitStates = 4;
-  return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
+  auto IsVALUFn = [](const MachineInstr &MI) {
+    return SIInstrInfo::isVALU(MI);
+  };
+
+  const int VCmpXWritesExecWaitStates = 4;
+  const int VALUWritesVDstWaitStates = 2;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+    Register Reg = Op.getReg();
+
+    int WaitStatesSinceDef =
+        VALUWritesVDstWaitStates -
+        getWaitStatesSinceDef(Reg, IsVALUFn,
+                              /*MaxWaitStates=*/VALUWritesVDstWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
+    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
+      break;
+  }
+
+  int VCmpXHazardWaits =
+      VCmpXWritesExecWaitStates -
+      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
+
+  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
+  return WaitStatesNeeded;
 }
 
 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 97bef7be711ff2..75834316750951 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -142,3 +142,116 @@ body:             |
     $vgpr4 = V_MOV_B32_e32 0, implicit $exec
     renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
 ...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# No hazard: the VALU writes a VGPR that the permlane does not read.
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0_otherreg
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# Both permlane hazards at once; the longer V_CMPX exec-write wait (4 states) dominates.
+# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+# GCN:      V_MOV_B32
+# GCN:      V_CMPX_EQ_I32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_PERMLANE
+name:            valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 0d5dfa46c2c260..e1cebe28f7fe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
index e3b0879af4307d..121c379053fcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)



More information about the llvm-branch-commits mailing list