[llvm] [AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already (PR #72830)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 19 23:59:14 PST 2023
Juan Manuel MARTINEZ CAAMAÑO <juamarti at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/72830 at github.com>
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
See #<!-- -->72829 to precommit test changes.
Original patch by @<!-- -->jmmartinez : #<!-- -->65735
---
Patch is 3.52 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72830.diff
67 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp (+5-1)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+75-61)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+25)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+7-5)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+56-281)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+58-299)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+2-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+7-38)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+1272-482)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (-20)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (-120)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+4-252)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+1-14)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll (-3)
- (modified) llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll (+2-6)
- (modified) llvm/test/CodeGen/AMDGPU/fence-barrier.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (-423)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (-297)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (-246)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (-297)
- (modified) llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+2-7)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+7-38)
- (modified) llvm/test/CodeGen/AMDGPU/gds-allocation.ll (+4-9)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll (+12-53)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (-160)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll (+4-474)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (-20)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-store.ll (+16-56)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+8-390)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+36-315)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+3-324)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+36-315)
- (modified) llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll (-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (-28)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll (-42)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll (-316)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+1-841)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+1-841)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (-492)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+16-885)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+16-825)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (-9)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+8-566)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll (-390)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll (-390)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (-9)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (-390)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll (-2)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll (-2)
- (added) llvm/test/CodeGen/AMDGPU/preserve-user-waitcnt.ll (+53)
- (modified) llvm/test/CodeGen/AMDGPU/release-vgprs.mir (+56-47)
- (modified) llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir (+17-16)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt.mir (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index bf65be3fe9035e7..c8ce1903d31537c 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
switch (MCI.getOpcode()) {
case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
default:
return 0;
case AMDGPU::S_WAITCNT: // This instruction
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..7048aee3099d166 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -292,6 +292,13 @@ class WaitcntBrackets {
VgprVmemTypes[GprNo] = 0;
}
+ void setNonKernelFunctionInitialState() {
+ for (InstCounterType Counter : inst_counter_types()) {
+ setScoreUB(Counter, getWaitCountMax(Counter));
+ PendingEvents |= WaitEventMaskForInst[Counter];
+ }
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
@@ -364,7 +371,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
const MachineRegisterInfo *MRI = nullptr;
AMDGPU::IsaVersion IV;
- DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
@@ -477,7 +483,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
+ MachineInstr *OldWaitcntInstr) const;
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
@@ -486,6 +492,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
MachineBasicBlock::instr_iterator It) const;
+ bool updateWaitcntIfSoft(MachineInstr *Waitcnt) const;
};
} // end anonymous namespace
@@ -870,6 +877,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}
+bool SIInsertWaitcnts::updateWaitcntIfSoft(MachineInstr *Waitcnt) const {
+ unsigned Opcode = Waitcnt->getOpcode();
+ if (!SIInstrInfo::isSoftWaitcnt(Opcode))
+ return false;
+
+ Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
+ return true;
+}
+
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
@@ -886,18 +902,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;
- if (II.getOpcode() == AMDGPU::S_WAITCNT) {
+ unsigned Opcode = II.getOpcode();
+ bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt(Opcode);
+
+ if (SIInstrInfo::isWaitcnt(Opcode)) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned IEnc = II.getOperand(0).getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- Wait = Wait.combined(OldWait);
- }
+ unsigned IEnc = II.getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ if (CanFullyDiscardWaitcntSequence)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
// Merge consecutive waitcnt of the same type by erasing multiples.
- if (!WaitcntInstr) {
+ if (!WaitcntInstr &&
+ (Wait.hasWaitExceptVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntInstr = &II;
} else {
II.eraseFromParent();
@@ -905,15 +925,17 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
}
} else {
- assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned OldVSCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
- }
- if (!WaitcntVsCntInstr) {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (CanFullyDiscardWaitcntSequence)
+ ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+
+ if (!WaitcntVsCntInstr &&
+ (Wait.hasWaitVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntVsCntInstr = &II;
} else {
II.eraseFromParent();
@@ -924,48 +946,38 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
// Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
- if (Wait.hasWaitExceptVsCnt()) {
- Modified |=
- updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
- AMDGPU::encodeWaitcnt(IV, Wait));
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitcntInstr
- << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntInstr << '\n');
+ Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
+ AMDGPU::encodeWaitcnt(IV, Wait));
+ Modified |= updateWaitcntIfSoft(WaitcntInstr);
- } else {
- WaitcntInstr->eraseFromParent();
- Modified = true;
- }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs()
+ << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
}
if (WaitcntVsCntInstr) {
- if (Wait.hasWaitVsCnt()) {
- assert(ST->hasVscnt());
- Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
- AMDGPU::OpName::simm16, Wait.VsCnt);
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VsCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: "
- << *WaitcntVsCntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
- } else {
- WaitcntVsCntInstr->eraseFromParent();
- Modified = true;
- }
+ Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
+ AMDGPU::OpName::simm16, Wait.VsCnt);
+ Modified |= updateWaitcntIfSoft(WaitcntVsCntInstr);
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntVsCntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
}
return Modified;
@@ -1284,7 +1296,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
+ MachineInstr *OldWaitcntInstr) const {
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
@@ -1317,7 +1329,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1331,7 +1342,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.VsCnt);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1574,9 +1584,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
static bool isWaitInstr(MachineInstr &Inst) {
- return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
- (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- Inst.getOperand(0).isReg() &&
+ auto Opcode = Inst.getOpcode();
+ return SIInstrInfo::isWaitcnt(Opcode) ||
+ (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}
@@ -1845,7 +1855,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
- TrackedWaitcntSet.clear();
BlockInfos.clear();
bool Modified = false;
@@ -1863,6 +1872,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ auto NonKernelInitialState =
+ std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+ NonKernelInitialState->setNonKernelFunctionInitialState();
+ BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
Modified = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c4baabcd9232b56..7ccaea823a2295c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8774,6 +8774,11 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+
+ // FIXME: move to the right place
+ if (SIInstrInfo::isSoftWaitcnt(Opcode))
+ Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+
unsigned Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index de2820e5c013ee3..fb11d30dbca6dca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -859,6 +859,31 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
}
+ static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
+ if (isWaitcnt(Opcode))
+ return AMDGPU::S_WAITCNT;
+
+ if (isWaitcntVsCnt(Opcode))
+ return AMDGPU::S_WAITCNT_VSCNT;
+
+ llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
+ }
+
+ static bool isWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
+ }
+
+ static bool isWaitcntVsCnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
+ // soft waitcnt instructions can be relaxed/optimized out by SIInsertWaitcnts
+ static bool isSoftWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_soft ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bc48f7b76c6d787..10ec54d3317fdf1 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
@@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
if (VSCnt) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Changed = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 90056e6ca281e78..83b325a148a168a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1464,6 +1464,21 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
+
+// "_soft" waitcnts are waitcnts that are either relaxed into their non-soft
+// counterpart, or completely removed.
+//
+// These are inserted by the memory legalizer to resolve memory dependencies
+// and are later optimized by SIInsertWaitcnts.
+// For example, a S_WAITCNT_soft 0 can be completely removed on a function
+// that doesn't access memory.
+def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
+def S_WAITCNT_VSCNT_soft : SOPP_Pseudo<"s_soft_waitcnt_vscnt", (ins SReg_32:$sdst, s16imm:$simm16), "$sdst, $simm16"> {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_sdst = 1;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index feb65a5210d59d2..25cee87244975e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -22,7 +22,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -38,7 +37,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -53,7 +51,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -67,8 +64,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
@@ -83,8 +79,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
@@ -107,7 +101,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -123,7 +116,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -138,7 +130,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -152,8 +143,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/72830
More information about the llvm-commits mailing list