[llvm] [AMDGCN][SIWholeQuadMode] Handle case when SI_KILL_I1_TERMINATOR -1,0 is not the only terminator (PR #122922)
Juan Manuel Martinez CaamaƱo via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 29 05:49:14 PST 2025
https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/122922
>From 9f1d438fbe50abd5fd46ded742574bfac6a91301 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 21 Jan 2025 16:36:17 +0100
Subject: [PATCH 1/3] [NFC][SIWholeQuadMode] Perform less lookups
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 33 ++++++++--------------
1 file changed, 12 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38f86ca3e9afcd..2f1803ee6df59c 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -218,8 +218,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
- void lowerBlock(MachineBasicBlock &MBB);
- void processBlock(MachineBasicBlock &MBB, bool IsEntry);
+ void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
+ void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
bool lowerLiveMaskQueries();
bool lowerCopyInstrs();
@@ -1035,12 +1035,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
-void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
- auto *BII = Blocks.find(&MBB);
- if (BII == Blocks.end())
- return;
-
- const BlockInfo &BI = BII->second;
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
if (!BI.NeedsLowering)
return;
@@ -1052,8 +1047,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
- if (StateTransition.count(&MI))
- State = StateTransition[&MI];
+ auto MIState = StateTransition.find(&MI);
+ if (MIState != StateTransition.end())
+ State = MIState->second;
MachineInstr *SplitPoint = nullptr;
switch (MI.getOpcode()) {
@@ -1260,13 +1256,8 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
StateTransition[MI] = NonStrictState;
}
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
- auto *BII = Blocks.find(&MBB);
- if (BII == Blocks.end())
- return;
-
- BlockInfo &BI = BII->second;
-
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
+ bool IsEntry) {
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
@@ -1809,11 +1800,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags & StateWQM)
Blocks[&Entry].InNeeds |= StateWQM;
// Wave mode switching requires full lowering pass.
- for (auto BII : Blocks)
- processBlock(*BII.first, BII.first == &Entry);
+ for (auto &BII : Blocks)
+ processBlock(*BII.first, BII.second, BII.first == &Entry);
// Lowering blocks causes block splitting so perform as a second pass.
- for (auto BII : Blocks)
- lowerBlock(*BII.first);
+ for (auto &BII : Blocks)
+ lowerBlock(*BII.first, BII.second);
Changed = true;
}
>From cfb3ec8896fefa2622beaee2d299bb1124c760d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Wed, 29 Jan 2025 14:39:18 +0100
Subject: [PATCH 2/3] [NFC][SIWholeQuadMode] Remove redundant arguments
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 43 +++++++++-------------
1 file changed, 18 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 2f1803ee6df59c..87eb6d9e385d46 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -212,11 +212,9 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachineBasicBlock::iterator Before, Register SavedOrig,
char NonStrictState, char CurrentStrictState);
- MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
-
- MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
- bool IsWQM);
- MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+ void splitBlock(MachineInstr *TermMI);
+ MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
+ MachineInstr *lowerKillF32(MachineInstr &MI);
void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
@@ -746,8 +744,8 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
return Restore;
}
-MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
- MachineInstr *TermMI) {
+void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
+ MachineBasicBlock *BB = TermMI->getParent();
LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
<< *TermMI << "\n");
@@ -796,12 +794,9 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
.addMBB(SplitBB);
LIS->InsertMachineInstrInMaps(*MI);
}
-
- return SplitBB;
}
-MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
- MachineInstr &MI) {
+MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
assert(LiveMaskReg.isVirtual());
const DebugLoc &DL = MI.getDebugLoc();
@@ -869,6 +864,8 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
llvm_unreachable("invalid ISD:SET cond code");
}
+ MachineBasicBlock &MBB = *MI.getParent();
+
// Pick opcode based on comparison type.
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
@@ -919,10 +916,11 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
return NewTerm;
}
-MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
- MachineInstr &MI, bool IsWQM) {
+MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
assert(LiveMaskReg.isVirtual());
+ MachineBasicBlock &MBB = *MI.getParent();
+
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *MaskUpdateMI = nullptr;
@@ -1055,10 +1053,10 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
switch (MI.getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
- SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
+ SplitPoint = lowerKillI1(MI, State == StateWQM);
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- SplitPoint = lowerKillF32(MBB, MI);
+ SplitPoint = lowerKillF32(MI);
break;
case AMDGPU::ENTER_STRICT_WWM:
ActiveLanesReg = MI.getOperand(0).getReg();
@@ -1084,12 +1082,8 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
}
// Perform splitting after instruction scan to simplify iteration.
- if (!SplitPoints.empty()) {
- MachineBasicBlock *BB = &MBB;
- for (MachineInstr *MI : SplitPoints) {
- BB = splitBlock(BB, MI);
- }
- }
+ for (MachineInstr *MI : SplitPoints)
+ splitBlock(MI);
}
// Return an iterator in the (inclusive) range [First, Last] at which
@@ -1546,19 +1540,18 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
for (MachineInstr *MI : KillInstrs) {
- MachineBasicBlock *MBB = MI->getParent();
MachineInstr *SplitPoint = nullptr;
switch (MI->getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
- SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
+ SplitPoint = lowerKillI1(*MI, IsWQM);
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- SplitPoint = lowerKillF32(*MBB, *MI);
+ SplitPoint = lowerKillF32(*MI);
break;
}
if (SplitPoint)
- splitBlock(MBB, SplitPoint);
+ splitBlock(SplitPoint);
}
return !KillInstrs.empty();
}
>From 4b48c54a14fe36012cb308be9142cec3e4d875fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 14 Jan 2025 15:11:41 +0100
Subject: [PATCH 3/3] [AMDGCN][SIWholeQuadMode] Rework
splitBlock/lowerKillI1/lowerKillF32 to handle case when SI_KILL_I1_TERMINATOR
-1 0 is not the unique terminator
The lowerKillI1 method wrongly handled cases where it inserted a
new S_BRANCH instruction when the kill was not the only terminator,
and then tried to split the block.
`SI_KILL_I1_TERMINATOR -1,0` doesn't have any effect. Instead of
lowering to an unconditional branch, we remove the instruction and
insert an unconditional branch only if the instruction is the last
terminator. No split is needed in this case (if the last terminator
has been reached, then the whole block was processed).
Also stop generating an unconditional branch in splitBlock: this
branch was redundant since TermMI is promoted to a
terminator that fallsthrough to the next block already.
Solves SWDEV-508819
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 37 ++++++++---------
.../AMDGPU/kill-true-in-return-block.ll | 41 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/wqm.ll | 4 +-
3 files changed, 61 insertions(+), 21 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 87eb6d9e385d46..bfa9ce82e2c192 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -768,11 +768,19 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
case AMDGPU::S_MOV_B64:
NewOpcode = AMDGPU::S_MOV_B64_term;
break;
- default:
+ case AMDGPU::S_ANDN2_B32:
+ NewOpcode = AMDGPU::S_ANDN2_B32_term;
+ break;
+ case AMDGPU::S_ANDN2_B64:
+ NewOpcode = AMDGPU::S_ANDN2_B64_term;
break;
+ default:
+ llvm_unreachable("Unexpected instruction");
}
- if (NewOpcode)
- TermMI->setDesc(TII->get(NewOpcode));
+
+ // these terminators fallthrough to the next block, no need to add an
+ // unconditional branch to the next block (SplitBB)
+ TermMI->setDesc(TII->get(NewOpcode));
if (SplitBB != BB) {
// Update dominator trees
@@ -787,12 +795,6 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
MDT->applyUpdates(DTUpdates);
if (PDT)
PDT->applyUpdates(DTUpdates);
-
- // Link blocks
- MachineInstr *MI =
- BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(SplitBB);
- LIS->InsertMachineInstrInMaps(*MI);
}
}
@@ -901,8 +903,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
assert(MBB.succ_size() == 1);
- MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
- .addMBB(*MBB.succ_begin());
// Update live intervals
LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
@@ -911,9 +911,8 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
LIS->InsertMachineInstrInMaps(*ExecMaskMI);
LIS->InsertMachineInstrInMaps(*EarlyTermMI);
- LIS->InsertMachineInstrInMaps(*NewTerm);
- return NewTerm;
+ return ExecMaskMI;
}
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
@@ -940,17 +939,17 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
.addReg(Exec);
} else {
// Static: kill does nothing
- MachineInstr *NewTerm = nullptr;
- if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+ bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
+ if (!IsLastTerminator) {
LIS->RemoveMachineInstrFromMaps(MI);
} else {
- assert(MBB.succ_size() == 1);
- NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
- .addMBB(*MBB.succ_begin());
+ assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
+ MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
}
MBB.remove(&MI);
- return NewTerm;
+ return nullptr;
}
} else {
if (!KillVal) {
diff --git a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
new file mode 100644
index 00000000000000..021c845d5ea6bb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s
+
+define amdgpu_ps float @kill_true(i1 %.not) {
+; CHECK-LABEL: kill_true:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: s_wqm_b64 exec, exec
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %if1
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $exec
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: .LBB0_2: ; %endif1
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: s_and_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+entry:
+ br i1 %.not, label %endif1, label %if1
+
+if1:
+ %C = call float @llvm.amdgcn.wqm.f32(float 0.000000e+00)
+ store float %C, ptr null, align 4
+ br label %endif1
+
+endif1:
+ call void @llvm.amdgcn.kill(i1 true)
+ ret float 0.000000e+00
+}
+
+declare void @llvm.amdgcn.kill(i1)
+
+declare float @llvm.amdgcn.wqm.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index deab4075818805..e439cf80e49241 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3361,7 +3361,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
-; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB55_1
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB55_1:
@@ -3377,7 +3377,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
-; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB55_1
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB55_1:
More information about the llvm-commits
mailing list