[llvm-branch-commits] [llvm] AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (PR #117287)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 22 13:11:05 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117287
From b4858a252d18dd63aa3b88c2685b41fa9a604b0c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 18 Mar 2024 14:41:11 +0530
Subject: [PATCH] AMDGPU: Handle gfx950 valu write vdst + permlane read hazard
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 30 ++++-
llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir | 113 ++++++++++++++++++
.../AMDGPU/llvm.amdgcn.permlane16.swap.ll | 6 +
.../AMDGPU/llvm.amdgcn.permlane32.swap.ll | 6 +
4 files changed, 153 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 97995560842090..4c37ef8855a5ba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2551,8 +2551,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
return isVCmpXWritesExec(*TII, *TRI, MI);
};
- const int NumWaitStates = 4;
- return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI);
+ };
+
+ const int VCmpXWritesExecWaitStates = 4;
+ const int VALUWritesVDstWaitStates = 2;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Op : MI->explicit_uses()) {
+ if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+ Register Reg = Op.getReg();
+
+ int WaitStatesSinceDef =
+ VALUWritesVDstWaitStates -
+ getWaitStatesSinceDef(Reg, IsVALUFn,
+ /*MaxWaitStates=*/VALUWritesVDstWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
+ if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
+ break;
+ }
+
+ int VCmpXHazardWaits =
+ VCmpXWritesExecWaitStates -
+ getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
+
+ WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
+ return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 97bef7be711ff2..75834316750951 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -142,3 +142,116 @@ body: |
$vgpr4 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_0
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_1
+body: |
+ bb.0:
+ liveins: $vgpr0
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane32_swap_0
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane32_swap_1
+body: |
+ bb.0:
+ liveins: $vgpr0
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# No hazard, write of other register
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
+# GCN: V_MOV_B32
+# GCN-NEXT: V_PERMLANE
+name: valu_write_vdst_read_permlane16_swap_0_otherreg
+body: |
+ bb.0:
+ liveins: $vgpr1
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# Both permlane hazards at once.
+# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+# GCN: V_MOV_B32
+# GCN: V_CMPX_EQ_I32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_PERMLANE
+name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+# GCN: V_CMPX_EQ_I32
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_PERMLANE
+name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+# GCN: V_CMPX_EQ_I32
+# GCN: V_MOV_B32
+# GCN: V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2, $vgpr3
+ $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 0d5dfa46c2c260..e1cebe28f7fe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
index e3b0879af4307d..121c379053fcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
More information about the llvm-branch-commits mailing list