[llvm] [AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization (PR #94133)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 1 18:37:36 PDT 2024
https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/94133
This does not work correctly in divergent control flow. Can be replaced with a later exec mask manipulation optimizer.
This reverts commit a3646ec1bc662e221c2a1d182987257c50958789.
From 5d6e4c99f6cf9e9348b98fa5661107e4152eb7d5 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 2 Jun 2024 08:56:11 +0900
Subject: [PATCH] [AMDGPU] Remove SIWholeQuadMode pseudo wavemode optimization
This does not work correctly in divergent control flow.
Can be replaced with a later exec mask manipulation optimizer.
This reverts commit a3646ec1bc662e221c2a1d182987257c50958789.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ---
llvm/lib/Target/AMDGPU/SIInstructions.td | 15 ------
.../Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 14 ++----
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 49 -------------------
llvm/test/CodeGen/AMDGPU/wqm.ll | 20 +++++++-
5 files changed, 22 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a7..0edcdb337b5af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::ENTER_PSEUDO_WM:
- case AMDGPU::EXIT_PSEUDO_WM: {
- // These do nothing.
- MI.eraseFromParent();
- break;
- }
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..89c96791bfd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let Uses = [EXEC];
- let Defs = [EXEC];
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f531..5837dbeb3f986 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
- Opc == AMDGPU::ENTER_PSEUDO_WM) {
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
dbgs() << "Entering ";
} else {
- assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
- Opc == AMDGPU::EXIT_PSEUDO_WM);
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Exiting ";
}
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
- } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
- dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
}
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..e95f1924b45d9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -214,8 +214,6 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
- void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
- MachineInstr *Exit);
void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1019,31 +1017,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
- MachineInstr *Entry,
- MachineInstr *Exit) {
- assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
- assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
- Register SaveOrig = Entry->getOperand(0).getReg();
-
- MachineInstr *NewEntry =
- BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
- MachineInstr *NewExit =
- BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
- LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
- Exit->eraseFromParent();
-
- LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
- Entry->eraseFromParent();
-
- LIS->removeInterval(SaveOrig);
-}
-
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1060,12 +1033,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
- MachineInstr *StrictEntry = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
- char PreviousState = State;
-
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1078,20 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
- case AMDGPU::ENTER_STRICT_WQM:
- StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
- break;
- case AMDGPU::EXIT_STRICT_WQM:
- if (State == StateWQM && StrictEntry) {
- // Transition WQM -> StrictWQM -> WQM detected.
- lowerPseudoStrictMode(MBB, StrictEntry, &MI);
- }
- StrictEntry = nullptr;
- break;
- case AMDGPU::ENTER_STRICT_WWM:
- case AMDGPU::EXIT_STRICT_WWM:
- StrictEntry = nullptr;
- break;
default:
break;
}
@@ -1245,11 +1201,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StrictStateNeeded;
-
- // Mark block as needing lower so it will be checked for unnecessary transitions.
- auto BII = Blocks.find(&MBB);
- if (BII != Blocks.end())
- BII->second.NeedsLowering = true;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95dfb12c8dbae..a2d6c4c10a5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2872,18 +2872,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
+; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2896,18 +2902,24 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
+; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -3192,8 +3204,10 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT: s_nop 0
+; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3234,9 +3248,11 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list