[llvm-branch-commits] [llvm] [AMDGPU] Add DS loop preheader flush (3/4) (PR #171948)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 11 17:08:15 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (hidekisaito)
<details>
<summary>Changes</summary>
Add insertDSPreheaderFlushes() to insert S_WAIT_DSCNT 0 in loop preheaders when DS wait relaxation was applied.
Assisted-by: Cursor / claude-4.5-opus-high
Depends on https://github.com/llvm/llvm-project/pull/171944
---
Full diff: https://github.com/llvm/llvm-project/pull/171948.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+67)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir (+4-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 777491fb58b80..28bc57ed2db4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -605,6 +605,7 @@ class SIInsertWaitcnts {
std::optional<unsigned> getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
const MachineInstr &MI) const;
bool applyDSLoopWaitOpt(MachineInstr &MI, AMDGPU::Waitcnt &Wait);
+ bool insertDSPreheaderFlushes(MachineFunction &MF);
};
// This objects maintains the current score brackets of each wait counter, and
@@ -2904,6 +2905,68 @@ bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr &MI,
return true;
}
+// Insert a DS_CNT flush (a DS wait with count 0) at the end of the preheader
+// of every loop for which DS wait relaxation was applied.
+//
+// The relaxed wait counts inside the loop are computed based on the DS loads
+// issued at the end of the previous iteration (via the backedge), but the
+// first iteration enters via the preheader. All DS loads issued in the
+// preheader must therefore be complete before the loop is entered.
+//
+// Loops are taken from LoopDSWaitOptCache; entries that are not Valid, did
+// not have a relaxation applied, or whose loop has no preheader are skipped.
+//
+// Returns true if any instruction was inserted or modified.
+bool SIInsertWaitcnts::insertDSPreheaderFlushes(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &[LoopHeader, Info] : LoopDSWaitOptCache) {
+    if (!Info.Valid || !Info.RelaxationApplied)
+      continue;
+
+    MachineLoop *ML = MLI->getLoopFor(LoopHeader);
+    if (!ML)
+      continue;
+
+    MachineBasicBlock *Preheader = ML->getLoopPreheader();
+    if (!Preheader)
+      continue;
+
+    // The flush goes right before the first terminator. If the preheader has
+    // no terminator (it falls through into the loop header),
+    // getFirstTerminator() returns end() and the flush is appended after the
+    // last instruction. Do not step back from end(): inserting in front of
+    // the last instruction would leave that instruction (possibly a DS load)
+    // after the flush and therefore not covered by it.
+    MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+    // If the instruction immediately preceding the insertion point is already
+    // a DS wait, reuse it instead of emitting a second one.
+    if (InsertPos != Preheader->begin()) {
+      MachineInstr &PrevMI = *std::prev(InsertPos);
+      const unsigned PrevOpc = PrevMI.getOpcode();
+      if (PrevOpc == AMDGPU::S_WAIT_DSCNT_soft ||
+          PrevOpc == AMDGPU::S_WAIT_DSCNT) {
+        MachineOperand &CountOp = PrevMI.getOperand(0);
+        if (CountOp.getImm() != 0) {
+          // Tighten the existing wait to a full flush.
+          CountOp.setImm(0);
+          Modified = true;
+          LLVM_DEBUG(dbgs() << "DS Loop Opt: Changed existing DS_CNT wait to 0"
+                            << " in preheader ";
+                     Preheader->printName(dbgs()); dbgs() << "\n");
+        }
+        continue;
+      }
+    }
+
+    // Borrow the debug location of the instruction we insert in front of, if
+    // there is one; otherwise leave it empty.
+    DebugLoc DL;
+    if (InsertPos != Preheader->end())
+      DL = InsertPos->getDebugLoc();
+    BuildMI(*Preheader, InsertPos, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft))
+        .addImm(0);
+    Modified = true;
+    LLVM_DEBUG(dbgs() << "DS Loop Opt: Inserted DS_CNT flush in preheader ";
+               Preheader->printName(dbgs()); dbgs() << " for loop at ";
+               LoopHeader->printName(dbgs()); dbgs() << "\n");
+  }
+
+  return Modified;
+}
+
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
@@ -3250,6 +3313,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
}
}
+
+ // Insert DS_CNT flushes in preheaders of loops that had wait counts relaxed.
+ Modified |= insertDSPreheaderFlushes(MF);
+
ReleaseVGPRInsts.clear();
PreheadersToFlush.clear();
LoopDSWaitOptCache.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
index 48fdabf255e6f..e6237338fda5b 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -17,6 +17,7 @@
# DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
# DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:
+# DBG: DS Loop Opt: Inserted DS_CNT flush in preheader bb.0 for loop at bb.1
--- |
define amdgpu_kernel void @ds_loop_eligible() { ret void }
@@ -31,9 +32,10 @@ machineFunctionInfo:
isEntryFunction: true
waveLimiter: false
body: |
+ ; Check preheader: OPT adds S_WAIT_DSCNT 0 flush, NOOPT does not
; OPT: bb.0:
- ; OPT-NOT: S_WAIT_DSCNT
- ; OPT: S_BRANCH %bb.1
+ ; OPT: S_WAIT_DSCNT_soft 0
+ ; OPT-NEXT: S_BRANCH %bb.1
; NOOPT: bb.0:
; NOOPT-NOT: S_WAIT_DSCNT
``````````
</details>
https://github.com/llvm/llvm-project/pull/171948
More information about the llvm-branch-commits
mailing list