[llvm] [AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode (PR #94452)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 5 06:22:22 PDT 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/94452
From 5f7ecf7036a668ced623febab110f8620e17d863 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:04:52 +0100
Subject: [PATCH 1/2] [AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow
to SIWholeQuadMode
NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT
instructions a little longer so that we can reliably identify them in
SIWholeQuadMode.
---
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 103 ------------------
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 103 +++++++++++++++++-
2 files changed, 102 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe24..5dc3457b5bfae 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {
MachineBasicBlock *emitEndCf(MachineInstr &MI);
- void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
- MachineInstr &MI) {
- MachineFunction &MF = *MBB->getParent();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- bool IsWave32 = ST.isWave32();
-
- if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
- // This should be before all vector instructions.
- MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
- .addImm(MI.getOperand(0).getImm());
- if (LIS) {
- LIS->RemoveMachineInstrFromMaps(MI);
- LIS->InsertMachineInstrInMaps(*InitMI);
- }
- MI.eraseFromParent();
- return;
- }
-
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- Register InputReg = MI.getOperand(0).getReg();
- MachineInstr *FirstMI = &*MBB->begin();
- if (InputReg.isVirtual()) {
- MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
- assert(DefInstr && DefInstr->isCopy());
- if (DefInstr->getParent() == MBB) {
- if (DefInstr != FirstMI) {
- // If the `InputReg` is defined in current block, we also need to
- // move that instruction to the beginning of the block.
- DefInstr->removeFromParent();
- MBB->insert(FirstMI, DefInstr);
- if (LIS)
- LIS->handleMove(*DefInstr);
- } else {
- // If first instruction is definition then move pointer after it.
- FirstMI = &*std::next(FirstMI->getIterator());
- }
- }
- }
-
- // Insert instruction sequence at block beginning (before vector operations).
- const DebugLoc DL = MI.getDebugLoc();
- const unsigned WavefrontSize = ST.getWavefrontSize();
- const unsigned Mask = (WavefrontSize << 1) - 1;
- Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- if (LV)
- LV->recomputeForSingleDefVirtReg(InputReg);
- auto BfmMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
- .addReg(CountReg)
- .addImm(0);
- auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(WavefrontSize);
- if (LV)
- LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
- auto CmovMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
-
- if (!LIS) {
- MI.eraseFromParent();
- return;
- }
-
- LIS->RemoveMachineInstrFromMaps(MI);
- MI.eraseFromParent();
-
- LIS->InsertMachineInstrInMaps(*BfeMI);
- LIS->InsertMachineInstrInMaps(*BfmMI);
- LIS->InsertMachineInstrInMaps(*CmpMI);
- LIS->InsertMachineInstrInMaps(*CmovMI);
-
- RecomputeRegs.insert(InputReg);
- LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
Changed = true;
break;
-
- // FIXME: find a better place for this
- case AMDGPU::SI_INIT_EXEC:
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
- lowerInitExec(MBB, MI);
- if (LIS)
- LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
- Changed = true;
- break;
-
- default:
- break;
}
if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..f57faa86e90ca 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
+ SmallVector<MachineInstr *, 4> InitExecInstrs;
void printInfo();
@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerLiveMaskQueries();
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
+ void lowerInitExec(MachineInstr &MI);
+ void lowerInitExecInstrs();
public:
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+ InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
}
}
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ bool IsWave32 = ST->isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ MachineInstr *InitMI =
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->InsertMachineInstrInMaps(*InitMI);
+ }
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If first instruction is definition then move pointer after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST->getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+ for (MachineInstr *MI : InitExecInstrs)
+ lowerInitExec(*MI);
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
+ InitExecInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
- LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ LowerToMovInstrs.empty() && KillInstrs.empty() &&
+ InitExecInstrs.empty()) {
lowerLiveMaskQueries();
return !LiveMaskQueries.empty();
}
+ lowerInitExecInstrs();
+
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
From 543cc94d01a7c3e9d1cee256acaead1cd5c55272 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 5 Jun 2024 14:17:42 +0100
Subject: [PATCH 2/2] Add lowerInitExecInstrs to fast path
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f57faa86e90ca..5b4c44302fa62 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1703,10 +1703,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
- LowerToMovInstrs.empty() && KillInstrs.empty() &&
- InitExecInstrs.empty()) {
+ LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ lowerInitExecInstrs();
lowerLiveMaskQueries();
- return !LiveMaskQueries.empty();
+ return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
}
lowerInitExecInstrs();
More information about the llvm-commits
mailing list