[llvm] [AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec (PR #93680)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 06:15:50 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/93680
- **[AMDGPU] Reduce use of continue in SIWholeQuadMode. NFC.**
- **[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode**
- **[AMDGPU] New test for WQM and llvm.amdgcn.init.exec**
- **[AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec**
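The problem being fixed: SIWholeQuadMode saves a copy of the initial EXEC mask (the live mask) at the start of the entry block, but in a shader that also calls llvm.amdgcn.init.exec that copy was taken before EXEC was initialized, as the updated GFX10 checks in patch 4 show. A minimal reproducer, taken from the new wqm.ll test added in patch 3:

  define amdgpu_gs void @wqm_init_exec() {
  bb:
    ; Initialize EXEC to all lanes active.
    call void @llvm.amdgcn.init.exec(i64 -1)
    call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
    ; llvm.amdgcn.wqm forces whole quad mode, so the pass must also save the
    ; live mask (a copy of EXEC) in the entry block.
    %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
    store i32 %i, i32 addrspace(3)* null, align 4
    ret void
  }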
From d9d4ba67399ff0e7bb2604a51758d0ab3c5ff770 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 10:42:40 +0100
Subject: [PATCH 1/4] [AMDGPU] Reduce use of continue in SIWholeQuadMode. NFC.
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 92 +++++++++-------------
1 file changed, 36 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ea8109bbee9ae..09dc1c781e2f3 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -278,11 +278,10 @@ LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
for (const MachineInstr &MI : *BII.first) {
auto III = Instructions.find(&MI);
- if (III == Instructions.end())
- continue;
-
- dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
- << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+ if (III != Instructions.end()) {
+ dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
+ << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+ }
}
}
}
@@ -455,10 +454,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
LiveRange &LR = LIS->getRegUnit(Unit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
-
- markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+ if (Value)
+ markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
}
}
}
@@ -499,19 +496,16 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (TII->isWQM(Opcode)) {
// If LOD is not supported WQM is not needed.
- if (!ST->hasExtendedImageInsts())
- continue;
// Only generate implicit WQM if implicit derivatives are required.
// This avoids inserting unintended WQM if a shader type without
// implicit derivatives uses an image sampling instruction.
- if (!HasImplicitDerivatives)
- continue;
- // Sampling instructions don't need to produce results for all pixels
- // in a quad, they just require all inputs of a quad to have been
- // computed for derivatives.
- markInstructionUses(MI, StateWQM, Worklist);
- GlobalFlags |= StateWQM;
- continue;
+ if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
+ // Sampling instructions don't need to produce results for all pixels
+ // in a quad, they just require all inputs of a quad to have been
+ // computed for derivatives.
+ markInstructionUses(MI, StateWQM, Worklist);
+ GlobalFlags |= StateWQM;
+ }
} else if (Opcode == AMDGPU::WQM) {
// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.
@@ -520,7 +514,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
} else if (Opcode == AMDGPU::SOFT_WQM) {
LowerToCopyInstrs.push_back(&MI);
SoftWQMInstrs.push_back(&MI);
- continue;
} else if (Opcode == AMDGPU::STRICT_WWM) {
// The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
// it needs to be executed in WQM or Exact so that its copy doesn't
@@ -528,7 +521,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
markInstructionUses(MI, StateStrictWWM, Worklist);
GlobalFlags |= StateStrictWWM;
LowerToMovInstrs.push_back(&MI);
- continue;
} else if (Opcode == AMDGPU::STRICT_WQM ||
TII->isDualSourceBlendEXP(MI)) {
// STRICT_WQM is similar to STRICTWWM, but instead of enabling all
@@ -551,7 +543,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateExact;
III.Disabled = StateWQM | StateStrict;
}
- continue;
} else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
Opcode == AMDGPU::DS_PARAM_LOAD ||
Opcode == AMDGPU::LDS_DIRECT_LOAD ||
@@ -561,7 +552,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
InstrInfo &II = Instructions[&MI];
II.Needs |= StateStrictWQM;
GlobalFlags |= StateStrictWQM;
- continue;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
III.Disabled = StateStrict;
@@ -574,7 +564,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
}
SetInactiveInstrs.push_back(&MI);
- continue;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -583,40 +572,33 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
GlobalFlags |= StateExact;
III.Disabled = StateWQM | StateStrict;
- continue;
- } else {
- if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
- LiveMaskQueries.push_back(&MI);
- } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
- Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
- Opcode == AMDGPU::SI_DEMOTE_I1) {
- KillInstrs.push_back(&MI);
- BBI.NeedsLowering = true;
- } else if (WQMOutputs) {
- // The function is in machine SSA form, which means that physical
- // VGPRs correspond to shader inputs and outputs. Inputs are
- // only used, outputs are only defined.
- // FIXME: is this still valid?
- for (const MachineOperand &MO : MI.defs()) {
- if (!MO.isReg())
- continue;
-
- Register Reg = MO.getReg();
-
- if (!Reg.isVirtual() &&
- TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
- Flags = StateWQM;
- break;
- }
+ } else if (Opcode == AMDGPU::SI_PS_LIVE ||
+ Opcode == AMDGPU::SI_LIVE_MASK) {
+ LiveMaskQueries.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+ Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+ Opcode == AMDGPU::SI_DEMOTE_I1) {
+ KillInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
+ } else if (WQMOutputs) {
+ // The function is in machine SSA form, which means that physical
+ // VGPRs correspond to shader inputs and outputs. Inputs are
+ // only used, outputs are only defined.
+ // FIXME: is this still valid?
+ for (const MachineOperand &MO : MI.defs()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() &&
+ TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
+ Flags = StateWQM;
+ break;
}
}
-
- if (!Flags)
- continue;
}
- markInstruction(MI, Flags, Worklist);
- GlobalFlags |= Flags;
+ if (Flags) {
+ markInstruction(MI, Flags, Worklist);
+ GlobalFlags |= Flags;
+ }
}
}
@@ -1568,8 +1550,6 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(*MBB, *MI);
break;
- default:
- continue;
}
if (SplitPoint)
splitBlock(MBB, SplitPoint);
From 5f5282e328cb2843728f55195a79559f4e5728f7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:04:52 +0100
Subject: [PATCH 2/4] [AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow
to SIWholeQuadMode
NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT
instructions a little longer so that we can reliably identify them in
SIWholeQuadMode.
---
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 103 ------------------
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 103 +++++++++++++++++-
2 files changed, 102 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe24..5dc3457b5bfae 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {
MachineBasicBlock *emitEndCf(MachineInstr &MI);
- void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
- MachineInstr &MI) {
- MachineFunction &MF = *MBB->getParent();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- bool IsWave32 = ST.isWave32();
-
- if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
- // This should be before all vector instructions.
- MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
- .addImm(MI.getOperand(0).getImm());
- if (LIS) {
- LIS->RemoveMachineInstrFromMaps(MI);
- LIS->InsertMachineInstrInMaps(*InitMI);
- }
- MI.eraseFromParent();
- return;
- }
-
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- Register InputReg = MI.getOperand(0).getReg();
- MachineInstr *FirstMI = &*MBB->begin();
- if (InputReg.isVirtual()) {
- MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
- assert(DefInstr && DefInstr->isCopy());
- if (DefInstr->getParent() == MBB) {
- if (DefInstr != FirstMI) {
- // If the `InputReg` is defined in current block, we also need to
- // move that instruction to the beginning of the block.
- DefInstr->removeFromParent();
- MBB->insert(FirstMI, DefInstr);
- if (LIS)
- LIS->handleMove(*DefInstr);
- } else {
- // If first instruction is definition then move pointer after it.
- FirstMI = &*std::next(FirstMI->getIterator());
- }
- }
- }
-
- // Insert instruction sequence at block beginning (before vector operations).
- const DebugLoc DL = MI.getDebugLoc();
- const unsigned WavefrontSize = ST.getWavefrontSize();
- const unsigned Mask = (WavefrontSize << 1) - 1;
- Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- if (LV)
- LV->recomputeForSingleDefVirtReg(InputReg);
- auto BfmMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
- .addReg(CountReg)
- .addImm(0);
- auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(WavefrontSize);
- if (LV)
- LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
- auto CmovMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
-
- if (!LIS) {
- MI.eraseFromParent();
- return;
- }
-
- LIS->RemoveMachineInstrFromMaps(MI);
- MI.eraseFromParent();
-
- LIS->InsertMachineInstrInMaps(*BfeMI);
- LIS->InsertMachineInstrInMaps(*BfmMI);
- LIS->InsertMachineInstrInMaps(*CmpMI);
- LIS->InsertMachineInstrInMaps(*CmovMI);
-
- RecomputeRegs.insert(InputReg);
- LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
Changed = true;
break;
-
- // FIXME: find a better place for this
- case AMDGPU::SI_INIT_EXEC:
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
- lowerInitExec(MBB, MI);
- if (LIS)
- LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
- Changed = true;
- break;
-
- default:
- break;
}
if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..f57faa86e90ca 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
+ SmallVector<MachineInstr *, 4> InitExecInstrs;
void printInfo();
@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerLiveMaskQueries();
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
+ void lowerInitExec(MachineInstr &MI);
+ void lowerInitExecInstrs();
public:
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+ InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
}
}
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ bool IsWave32 = ST->isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ MachineInstr *InitMI =
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->InsertMachineInstrInMaps(*InitMI);
+ }
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If first instruction is definition then move pointer after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST->getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+ for (MachineInstr *MI : InitExecInstrs)
+ lowerInitExec(*MI);
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
+ InitExecInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
- LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ LowerToMovInstrs.empty() && KillInstrs.empty() &&
+ InitExecInstrs.empty()) {
lowerLiveMaskQueries();
return !LiveMaskQueries.empty();
}
+ lowerInitExecInstrs();
+
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
From 2448b0e301d2fb3dc64e014ab521fb2fd66ac23f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 14:13:37 +0100
Subject: [PATCH 3/4] [AMDGPU] New test for WQM and llvm.amdgcn.init.exec
---
llvm/test/CodeGen/AMDGPU/wqm.ll | 46 +++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95dfb12c8dbae..15279fea91960 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3312,6 +3312,52 @@ main_body:
ret void
}
+; Test the interaction between wqm and llvm.amdgcn.init.exec.
+define amdgpu_gs void @wqm_init_exec() {
+; GFX9-W64-LABEL: wqm_init_exec:
+; GFX9-W64: ; %bb.0: ; %bb
+; GFX9-W64-NEXT: s_mov_b64 exec, -1
+; GFX9-W64-NEXT: s_mov_b32 s0, 0
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT: s_mov_b32 s1, s0
+; GFX9-W64-NEXT: s_mov_b32 s2, s0
+; GFX9-W64-NEXT: s_mov_b32 s3, s0
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-W64-NEXT: ds_write_b32 v0, v1
+; GFX9-W64-NEXT: s_endpgm
+;
+; GFX10-W32-LABEL: wqm_init_exec:
+; GFX10-W32: ; %bb.0: ; %bb
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-W32-NEXT: s_mov_b32 s0, 0
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s2, s0
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0
+; GFX10-W32-NEXT: s_mov_b32 s1, s0
+; GFX10-W32-NEXT: s_mov_b32 s3, s0
+; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-W32-NEXT: ds_write_b32 v0, v4
+; GFX10-W32-NEXT: s_endpgm
+bb:
+ call void @llvm.amdgcn.init.exec(i64 -1)
+ call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
+ store i32 %i, i32 addrspace(3)* null, align 4
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
From d996f0a4887a7e4dd6d179b64e575a04eafe8ff7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:55:55 +0100
Subject: [PATCH 4/4] [AMDGPU] Fix interaction between WQM and
llvm.amdgcn.init.exec
Whole quad mode requires inserting a copy of the initial EXEC mask. In a
function that also uses llvm.amdgcn.init.exec, insert the COPY after
initializing EXEC.
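Concretely, in the wave32 checks of wqm.ll (updated below) the entry block changes from saving the live mask before EXEC is initialized to saving it afterwards:

  ; before this patch
  s_mov_b32 s1, exec_lo    ; live mask copied from uninitialized EXEC
  s_mov_b32 exec_lo, -1    ; lowered llvm.amdgcn.init.exec

  ; after this patch
  s_mov_b32 exec_lo, -1    ; lowered llvm.amdgcn.init.exec
  s_mov_b32 s1, exec_lo    ; live mask now reflects the initialized EXEC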
---
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 24 ++++++++++++++++------
llvm/test/CodeGen/AMDGPU/wqm.ll | 2 +-
2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f57faa86e90ca..fe8ee0a4855c8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -225,7 +225,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
void lowerInitExec(MachineInstr &MI);
- void lowerInitExecInstrs();
+ MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
public:
static char ID;
@@ -1648,9 +1648,23 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
LIS->createAndComputeVirtRegInterval(CountReg);
}
-void SIWholeQuadMode::lowerInitExecInstrs() {
- for (MachineInstr *MI : InitExecInstrs)
+/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
+/// for instructions that depend on EXEC.
+MachineBasicBlock::iterator
+SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
+ MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
+
+ for (MachineInstr *MI : InitExecInstrs) {
+ // Try to handle undefined cases gracefully:
+ // - multiple INIT_EXEC instructions
+ // - INIT_EXEC instructions not in the entry block
+ if (MI->getParent() == &Entry)
+ InsertPt = std::next(MI->getIterator());
+
lowerInitExec(*MI);
+ }
+
+ return InsertPt;
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -1709,10 +1723,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
return !LiveMaskQueries.empty();
}
- lowerInitExecInstrs();
-
MachineBasicBlock &Entry = MF.front();
- MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+ MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
// Store a copy of the original live mask when required
if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 15279fea91960..f3d5e557a409e 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3334,8 +3334,8 @@ define amdgpu_gs void @wqm_init_exec() {
;
; GFX10-W32-LABEL: wqm_init_exec:
; GFX10-W32: ; %bb.0: ; %bb
-; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo