[llvm] a3646ec - [AMDGPU] Add pseudo wavemode to optimize strict_wqm
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 27 17:46:01 PDT 2022
Author: Carl Ritson
Date: 2022-10-28T09:45:17+09:00
New Revision: a3646ec1bc662e221c2a1d182987257c50958789
URL: https://github.com/llvm/llvm-project/commit/a3646ec1bc662e221c2a1d182987257c50958789
DIFF: https://github.com/llvm/llvm-project/commit/a3646ec1bc662e221c2a1d182987257c50958789.diff
LOG: [AMDGPU] Add pseudo wavemode to optimize strict_wqm
Strict WQM does not require a WQM transition if it occurs within
an existing WQM section.
This occurs heavily in GFX11 pixel shaders with LDS_PARAM_LOAD,
which leads to unnecessary EXEC mask manipulation.
To avoid these transitions, detect WQM -> Strict WQM -> WQM
and substitute new ENTER_PSEUDO_WM/EXIT_PSEUDO_WM markers instead.
These are treated similarly by the WWM register pre-allocation pass,
but do not manipulate EXEC or use registers to save EXEC state.
Reviewed By: piotr
Differential Revision: https://reviews.llvm.org/D136813
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 640e401677e45..1a3c11cf1d014 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2142,6 +2142,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
+ case AMDGPU::ENTER_PSEUDO_WM:
+ case AMDGPU::EXIT_PSEUDO_WM: {
+ // These do nothing.
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5bd7f8572d091..a421b0346684c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -188,6 +188,21 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let mayStore = 0;
}
+// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
+def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
+ let Uses = [EXEC];
+ let Defs = [EXEC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 4fab13bb44b1f..c2ddfd7881ab7 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -163,15 +163,19 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
+ Opc == AMDGPU::ENTER_PSEUDO_WM) {
dbgs() << "Entering ";
} else {
- assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
+ Opc == AMDGPU::EXIT_PSEUDO_WM);
dbgs() << "Exiting ";
}
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
+ } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
+ dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
@@ -214,14 +218,16 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
RegsAssigned |= processDef(MI.getOperand(0));
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
+ MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
+ MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f4986376742f9..0a80779f8a5cd 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -215,6 +215,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+ void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
+ MachineInstr *Exit);
void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1040,6 +1042,31 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}
+// Convert a strict mode transition to a pseudo transition.
+// This still pre-allocates registers to prevent clobbering,
+// but avoids any EXEC mask changes.
+void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
+ MachineInstr *Entry,
+ MachineInstr *Exit) {
+ assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
+ assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
+
+ Register SaveOrig = Entry->getOperand(0).getReg();
+
+ MachineInstr *NewEntry =
+ BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
+ MachineInstr *NewExit =
+ BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
+
+ LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
+ Exit->eraseFromParent();
+
+ LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
+ Entry->eraseFromParent();
+
+ LIS->removeInterval(SaveOrig);
+}
+
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1056,9 +1083,12 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
+ MachineInstr *StrictEntry = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
+ char PreviousState = State;
+
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1071,6 +1101,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
+ case AMDGPU::ENTER_STRICT_WQM:
+ StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
+ break;
+ case AMDGPU::EXIT_STRICT_WQM:
+ if (State == StateWQM && StrictEntry) {
+ // Transition WQM -> StrictWQM -> WQM detected.
+ lowerPseudoStrictMode(MBB, StrictEntry, &MI);
+ }
+ StrictEntry = nullptr;
+ break;
+ case AMDGPU::ENTER_STRICT_WWM:
+ case AMDGPU::EXIT_STRICT_WWM:
+ StrictEntry = nullptr;
+ break;
default:
break;
}
@@ -1213,7 +1257,12 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
.addImm(-1);
}
LIS->InsertMachineInstrInMaps(*MI);
- StateTransition[MI] = StateStrictWWM;
+ StateTransition[MI] = StrictStateNeeded;
+
+ // Mark block as needing lower so it will be checked for unnecessary transitions.
+ auto BII = Blocks.find(&MBB);
+ if (BII != Blocks.end())
+ BII->second.NeedsLowering = true;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 16c30174657a5..2167a5ab8f42d 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2820,24 +2820,18 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
-; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
-; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
-; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
-; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
-; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -2850,24 +2844,18 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
-; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
-; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
-; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -3150,10 +3138,8 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
-; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3194,11 +3180,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
-; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
-; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list