[llvm] [AMDGPU][SIInsertWaitcnts][NFC] Move soft xcnt deletion to separate function (PR #181760)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 16 21:14:59 PST 2026
https://github.com/vporpo updated https://github.com/llvm/llvm-project/pull/181760
>From 6c8f9960116f7ea1f5fb034a282bc5edf0b6f07a Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Mon, 16 Feb 2026 22:11:44 +0000
Subject: [PATCH 1/2] [AMDGPU][SIInsertWaitcnts][NFC] Move soft xcnt deletion
to separate function
This patch simplifies the logic of `insertWaitcntInBlock()` by moving
the code that removes the redundant soft xcnt instructions to a new function:
`removeRedundantSoftXcnts()`.
While doing so, this patch also cleans up the logic a bit by dropping the
AtomicRMWState and the corresponding functions.
This helps in several ways:
- insertWaitcntInBlock() will now do what its name suggests, i.e., only insert and not remove.
- it makes it clear that removal of softxcnts is orthogonal to insertion of waitcnts.
- we won't have to worry about both erased and new instructions in insertWaitcntInBlock()'s loop.
The change should be NFC.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 113 +++++++-------------
1 file changed, 41 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 111867583fde3..069a714402d59 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -99,21 +99,6 @@ static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
}
}
-static bool isSoftXcnt(MachineInstr &MI) {
- return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
-}
-
-static bool isAtomicRMW(MachineInstr &MI) {
- return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() &&
- MI.mayStore();
-}
-
-enum class AtomicRMWState {
- NewBlock, // Start of a new atomic RMW block
- InsideBlock, // Middle of an existing block
- NotInBlock // Not in an atomic RMW block
-};
-
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
@@ -676,10 +661,11 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
+ /// Legalizer. Returns true if block was modified.
+ bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
bool ExpertMode) const;
- AtomicRMWState getAtomicRMWState(MachineInstr &MI,
- AtomicRMWState PrevState) const;
const WaitEventSet &getWaitEvents(InstCounterType T) const {
return WCG->getWaitEvents(T);
}
@@ -3126,39 +3112,6 @@ void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
.addImm(EncodedReg);
}
-// Track back-to-back atomic RMW instructions, referred to as a block.
-//
-// Determines whether \p MI starts a new atomic RMW block, is inside
-// an existing block, or is outside of a block. A block is broken when a
-// CU-scoped memory op or an atomic store is encountered. ALU ops
-// and non-memory instructions don't break a block. The function returns
-// the new state after processing the current instruction based on
-// \p PrevState, the previously captured state.
-AtomicRMWState
-SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
- AtomicRMWState PrevState) const {
- if (isAtomicRMW(MI)) {
- // Transition from NotInBlock -> NewBlock -> InsideBlock.
- if (PrevState == AtomicRMWState::NotInBlock)
- return AtomicRMWState::NewBlock;
- if (PrevState == AtomicRMWState::NewBlock)
- return AtomicRMWState::InsideBlock;
-
- return PrevState;
- }
-
- // LDS memory operations don't break the block.
- if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI)))
- return PrevState;
-
- // Reset the atomic RMW block state when found other VMEM and SMEM operations.
- if (MI.mayLoad() ^ MI.mayStore())
- return AtomicRMWState::NotInBlock;
-
- // Return the previous state otherwise.
- return PrevState;
-}
-
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
@@ -3187,7 +3140,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
- AtomicRMWState RMWState = AtomicRMWState::NotInBlock;
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
E = Block.instr_end();
@@ -3197,32 +3149,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
continue;
}
- // Get the atomic RMW block state for current instruction.
- RMWState = getAtomicRMWState(Inst, RMWState);
-
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
if (isWaitInstr(Inst) ||
(IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
- ++Iter;
- bool IsSoftXcnt = isSoftXcnt(Inst);
- // The Memory Legalizer conservatively inserts a soft xcnt before each
- // atomic RMW operation. However, for sequences of back-to-back atomic
- // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
- // the redundant soft xcnts when we're inside an atomic RMW block.
- if (Iter != E && IsSoftXcnt) {
- // Check if the next instruction can potentially change the atomic RMW
- // state.
- RMWState = getAtomicRMWState(*Iter, RMWState);
- }
-
- if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
- // Delete this soft xcnt.
- Inst.eraseFromParent();
- Modified = true;
- } else if (!OldWaitcntInstr) {
+ if (!OldWaitcntInstr)
OldWaitcntInstr = &Inst;
- }
+ ++Iter;
continue;
}
@@ -3344,6 +3277,41 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
return Modified;
}
+bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
+ if (Block.size() <= 1)
+ return false;
+ // The Memory Legalizer conservatively inserts a soft xcnt before each
+ // atomic RMW operation. However, for sequences of back-to-back atomic
+ // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
+ // the redundant soft xcnts.
+ bool Modified = false;
+ // Remember the last atomic with a soft xcnt right before it.
+ MachineInstr *LastAtomicWithSoftXcnt = nullptr;
+
+ for (MachineInstr &MI : drop_begin(Block)) {
+ // Ignore last atomic if non-LDS VMEM and SMEM.
+ bool IsLDS =
+ TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));
+ if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
+ LastAtomicWithSoftXcnt = nullptr;
+
+ bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
+ MI.mayLoad() && MI.mayStore();
+ MachineInstr &PrevMI = *MI.getPrevNode();
+ // This is an atomic with a soft xcnt.
+ if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
+ // If we have already found an atomic with a soft xcnt, remove this soft
+ // xcnt as it's redundant.
+ if (LastAtomicWithSoftXcnt) {
+ PrevMI.eraseFromParent();
+ Modified = true;
+ }
+ LastAtomicWithSoftXcnt = &MI;
+ }
+ }
+ return Modified;
+}
+
// Return flags indicating which counters should be flushed in the preheader.
PreheaderFlushFlags
SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
@@ -3668,6 +3636,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
}
+ Modified |= removeRedundantSoftXcnts(*MBB);
Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
BI.Dirty = false;
>From 6a3b5617bb86803cfff653b3ea6f924aa5eb7a06 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Tue, 17 Feb 2026 05:12:04 +0000
Subject: [PATCH 2/2] fixup! [AMDGPU][SIInsertWaitcnts][NFC] Move soft xcnt
deletion to separate function
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 069a714402d59..c4aa6450c0c70 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -3636,7 +3636,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
}
- Modified |= removeRedundantSoftXcnts(*MBB);
+ if (ST->hasWaitXcnt()) {
+ Modified |= removeRedundantSoftXcnts(*MBB);
+ }
Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
BI.Dirty = false;
More information about the llvm-commits
mailing list