[llvm] [AMDGPU][NFC] Minor source cleanups in SIInsertWaitcnts (PR #181095)
Sameer Sahasrabuddhe via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 06:48:40 PST 2026
https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/181095
>From 9283cf71b57ce421545139ce9d5309c8a9cabcb9 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 12:12:42 +0530
Subject: [PATCH 1/3] [AMDGPU][NFC] Minor source cleanups in SIInsertWaitcnts
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 31 ++++++++++-----------
1 file changed, 14 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 111867583fde3..0541efe2e693b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1002,6 +1002,15 @@ class WaitcntBrackets {
// State of all counters at each async mark encountered so far.
SmallVector<CounterValueArray> AsyncMarks;
+
+ // For each backedge in isolation, the algorithm reachs a fixed point after
+ // the first call to merge(). This is unchanged even with the AsyncMarks
+ // array because we call mergeScore just like the other cases.
+ //
+ // But in the rare pathological case, a nest of loops that pushes marks
+ // without waiting on any mark can cause AsyncMarks to grow very large. We cap
+ // it to a reasonable limit. We can tune this later or potentially introduce a
+ // user option to control the value.
static constexpr unsigned MaxAsyncMarks = 16;
// Track the upper bound score for async operations that are not part of a
@@ -1256,7 +1265,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// counters too, so will need a map from instruction or event types to
// counter types.
if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
- assert(!SIInstrInfo::usesASYNC_CNT(Inst));
+ assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
+ "unexpected GFX1250 instruction");
AsyncScore[T] = CurrScore;
}
@@ -1539,7 +1549,6 @@ AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
}
});
- AMDGPU::Waitcnt Wait;
if (AsyncMarks.size() == MaxAsyncMarks) {
// Enforcing MaxAsyncMarks here is unnecessary work because the size of
// MaxAsyncMarks is linear when traversing straightline code. But we do
@@ -1549,6 +1558,7 @@ AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
N = std::min(N, (unsigned)MaxAsyncMarks - 1);
}
+ AMDGPU::Waitcnt Wait;
if (AsyncMarks.size() <= N) {
LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
return Wait;
@@ -2952,15 +2962,6 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
// Determine maximum length needed after merging
auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
-
- // For each backedge in isolation, the algorithm reachs a fixed point after
- // the first call to merge(). This is unchanged even with the AsyncMarks
- // array because we call mergeScore just like the other cases.
- //
- // But in the rare pathological case, a nest of loops that pushes marks
- // without waiting on any mark can cause AsyncMarks to grow very large. We cap
- // it to a reasonable limit. We can tune this later or potentially introduce a
- // user option to control the value.
MaxSize = std::min(MaxSize, MaxAsyncMarks);
// Keep only the most recent marks within our limit.
@@ -2972,7 +2973,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
// pending async operations at this checkpoint" and acts as the identity
// element for max() during merging. We pad at the beginning since the marks
// need to be aligned in most-recent order.
- CounterValueArray ZeroMark{};
+ constexpr CounterValueArray ZeroMark{};
AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
LLVM_DEBUG({
@@ -2981,9 +2982,6 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
llvm::interleaveComma(Mark, dbgs());
dbgs() << '\n';
}
- });
-
- LLVM_DEBUG({
dbgs() << "Other marks:\n";
for (const auto &Mark : OtherMarks) {
llvm::interleaveComma(Mark, dbgs());
@@ -2997,8 +2995,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
unsigned OtherSize = OtherMarks.size();
unsigned OurSize = AsyncMarks.size();
unsigned MergeCount = std::min(OtherSize, OurSize);
- assert(OurSize == MaxSize);
- for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) {
+ for (auto Idx : seq<unsigned>(1, MergeCount)) {
for (auto T : inst_counter_types(Context->MaxCounter)) {
StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
OtherMarks[OtherSize - Idx][T]);
>From 533eb5fb6963f1700a64ca080acc0d4c21d6fc3b Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 13:19:52 +0530
Subject: [PATCH 2/3] integer range needs to be inclusive
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0541efe2e693b..3fb8dee3a55c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2995,7 +2995,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
unsigned OtherSize = OtherMarks.size();
unsigned OurSize = AsyncMarks.size();
unsigned MergeCount = std::min(OtherSize, OurSize);
- for (auto Idx : seq<unsigned>(1, MergeCount)) {
+ for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
for (auto T : inst_counter_types(Context->MaxCounter)) {
StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
OtherMarks[OtherSize - Idx][T]);
>From 5d6c7e64d5de2d9bc2ab7af76667d5238db4575d Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 20:18:30 +0530
Subject: [PATCH 3/3] Update llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Co-authored-by: Jay Foad <jay.foad at amd.com>
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 3fb8dee3a55c1..005d624971ee9 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1003,7 +1003,7 @@ class WaitcntBrackets {
// State of all counters at each async mark encountered so far.
SmallVector<CounterValueArray> AsyncMarks;
- // For each backedge in isolation, the algorithm reachs a fixed point after
+ // For each backedge in isolation, the algorithm reaches a fixed point after
// the first call to merge(). This is unchanged even with the AsyncMarks
// array because we call mergeScore just like the other cases.
//
More information about the llvm-commits mailing list