[llvm] [AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec (PR #93680)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 7 02:20:23 PDT 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/93680
>From 56d41fab750c0ffcdbd97a6e2d41c09643bcf5fc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 14:13:37 +0100
Subject: [PATCH 1/2] [AMDGPU] New test for WQM and llvm.amdgcn.init.exec
---
llvm/test/CodeGen/AMDGPU/wqm.ll | 46 +++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 6fcf5067b0225..967a394a1d92c 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3395,6 +3395,52 @@ main_body:
ret void
}
+; Test the interaction between wqm and llvm.amdgcn.init.exec.
+define amdgpu_gs void @wqm_init_exec() {
+; GFX9-W64-LABEL: wqm_init_exec:
+; GFX9-W64: ; %bb.0: ; %bb
+; GFX9-W64-NEXT: s_mov_b64 exec, -1
+; GFX9-W64-NEXT: s_mov_b32 s0, 0
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT: s_mov_b32 s1, s0
+; GFX9-W64-NEXT: s_mov_b32 s2, s0
+; GFX9-W64-NEXT: s_mov_b32 s3, s0
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-W64-NEXT: ds_write_b32 v0, v1
+; GFX9-W64-NEXT: s_endpgm
+;
+; GFX10-W32-LABEL: wqm_init_exec:
+; GFX10-W32: ; %bb.0: ; %bb
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-W32-NEXT: s_mov_b32 s0, 0
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s2, s0
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0
+; GFX10-W32-NEXT: s_mov_b32 s1, s0
+; GFX10-W32-NEXT: s_mov_b32 s3, s0
+; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-W32-NEXT: ds_write_b32 v0, v4
+; GFX10-W32-NEXT: s_endpgm
+bb:
+ call void @llvm.amdgcn.init.exec(i64 -1)
+ call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
+ store i32 %i, i32 addrspace(3)* null, align 4
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
>From 0e942f61bd5c28413e8d7ca43c9edd9be8d149b0 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:55:55 +0100
Subject: [PATCH 2/2] [AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec
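Whole quad mode requires inserting a copy of the initial EXEC mask. In a
function that also uses llvm.amdgcn.init.exec, insert the COPY after
initializing EXEC, so that the copy captures the initialized mask rather
than the value EXEC held before llvm.amdgcn.init.exec took effect.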
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 29 +++++++++++++++-------
llvm/test/CodeGen/AMDGPU/wqm.ll | 2 +-
2 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 5b4c44302fa62..913942dda19d9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -225,7 +225,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
void lowerInitExec(MachineInstr &MI);
- void lowerInitExecInstrs();
+ MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
public:
static char ID;
@@ -1648,9 +1648,23 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
LIS->createAndComputeVirtRegInterval(CountReg);
}
-void SIWholeQuadMode::lowerInitExecInstrs() {
- for (MachineInstr *MI : InitExecInstrs)
+/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
+/// for instructions that depend on EXEC.
+MachineBasicBlock::iterator
+SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
+ MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
+
+ for (MachineInstr *MI : InitExecInstrs) {
+ // Try to handle undefined cases gracefully:
+ // - multiple INIT_EXEC instructions
+ // - INIT_EXEC instructions not in the entry block
+ if (MI->getParent() == &Entry)
+ InsertPt = std::next(MI->getIterator());
+
lowerInitExec(*MI);
+ }
+
+ return InsertPt;
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -1701,19 +1715,16 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LiveMaskReg = Exec;
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
+
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
LowerToMovInstrs.empty() && KillInstrs.empty()) {
- lowerInitExecInstrs();
lowerLiveMaskQueries();
return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
}
- lowerInitExecInstrs();
-
- MachineBasicBlock &Entry = MF.front();
- MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
-
// Store a copy of the original live mask when required
if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 967a394a1d92c..3bf6c104a0254 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3417,8 +3417,8 @@ define amdgpu_gs void @wqm_init_exec() {
;
; GFX10-W32-LABEL: wqm_init_exec:
; GFX10-W32: ; %bb.0: ; %bb
-; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
More information about the llvm-commits mailing list