[llvm] 27a8afa - AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (#117287)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 25 09:33:07 PST 2024
Author: Matt Arsenault
Date: 2024-11-25T09:33:04-08:00
New Revision: 27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6
URL: https://github.com/llvm/llvm-project/commit/27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6
DIFF: https://github.com/llvm/llvm-project/commit/27a8afa3fcf7e0378dff65cf3374f7a4e4e2b9a6.diff
LOG: AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (#117287)
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 97995560842090..4c37ef8855a5ba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2551,8 +2551,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
return isVCmpXWritesExec(*TII, *TRI, MI);
};
- const int NumWaitStates = 4;
- return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI);
+ };
+
+ const int VCmpXWritesExecWaitStates = 4;
+ const int VALUWritesVDstWaitStates = 2;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Op : MI->explicit_uses()) {
+ if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+ Register Reg = Op.getReg();
+
+ int WaitStatesSinceDef =
+ VALUWritesVDstWaitStates -
+ getWaitStatesSinceDef(Reg, IsVALUFn,
+ /*MaxWaitStates=*/VALUWritesVDstWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
+ if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
+ break;
+ }
+
+ int VCmpXHazardWaits =
+ VCmpXWritesExecWaitStates -
+ getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
+
+ WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
+ return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 97bef7be711ff2..75834316750951 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -142,3 +142,116 @@ body: |
$vgpr4 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_0
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_1
+body: |
+ bb.0:
+ liveins: $vgpr0
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane32_swap_0
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane32_swap_1
+body: |
+ bb.0:
+ liveins: $vgpr0
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# No hazard, write of other register
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
+# GCN: V_MOV_B32
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_0_otherreg
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# Both permlane hazards at once.
+# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+# GCN: V_MOV_B32
+# GCN: V_CMPX_EQ_I32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_PERMLANE
+name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+# GCN: V_CMPX_EQ_I32
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_PERMLANE
+name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+# GCN: V_CMPX_EQ_I32
+# GCN: V_MOV_B32
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 0d5dfa46c2c260..e1cebe28f7fe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
index e3b0879af4307d..121c379053fcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
More information about the llvm-commits mailing list