[llvm] [AMDGPU][NFC] Minor source cleanups in SIInsertWaitcnts (PR #181095)

Thu Feb 12 06:48:40 PST 2026

https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/181095

From 9283cf71b57ce421545139ce9d5309c8a9cabcb9 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 12:12:42 +0530
Subject: [PATCH 1/3] [AMDGPU][NFC] Minor source cleanups in SIInsertWaitcnts

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 31 ++++++++++-----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 111867583fde3..0541efe2e693b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1002,6 +1002,15 @@ class WaitcntBrackets {
 
   // State of all counters at each async mark encountered so far.
   SmallVector<CounterValueArray> AsyncMarks;
+
+  // For each backedge in isolation, the algorithm reachs a fixed point after
+  // the first call to merge(). This is unchanged even with the AsyncMarks
+  // array because we call mergeScore just like the other cases.
+  //
+  // But in the rare pathological case, a nest of loops that pushes marks
+  // without waiting on any mark can cause AsyncMarks to grow very large. We cap
+  // it to a reasonable limit. We can tune this later or potentially introduce a
+  // user option to control the value.
   static constexpr unsigned MaxAsyncMarks = 16;
 
   // Track the upper bound score for async operations that are not part of a
@@ -1256,7 +1265,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
     // counters too, so will need a map from instruction or event types to
     // counter types.
     if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
-      assert(!SIInstrInfo::usesASYNC_CNT(Inst));
+      assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
+             "unexpected GFX1250 instruction");
       AsyncScore[T] = CurrScore;
     }
 
@@ -1539,7 +1549,6 @@ AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
     }
   });
 
-  AMDGPU::Waitcnt Wait;
   if (AsyncMarks.size() == MaxAsyncMarks) {
     // Enforcing MaxAsyncMarks here is unnecessary work because the size of
     // MaxAsyncMarks is linear when traversing straightline code. But we do
@@ -1549,6 +1558,7 @@ AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
     N = std::min(N, (unsigned)MaxAsyncMarks - 1);
   }
 
+  AMDGPU::Waitcnt Wait;
   if (AsyncMarks.size() <= N) {
     LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
     return Wait;
@@ -2952,15 +2962,6 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
 
   // Determine maximum length needed after merging
   auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
-
-  // For each backedge in isolation, the algorithm reachs a fixed point after
-  // the first call to merge(). This is unchanged even with the AsyncMarks
-  // array because we call mergeScore just like the other cases.
-  //
-  // But in the rare pathological case, a nest of loops that pushes marks
-  // without waiting on any mark can cause AsyncMarks to grow very large. We cap
-  // it to a reasonable limit. We can tune this later or potentially introduce a
-  // user option to control the value.
   MaxSize = std::min(MaxSize, MaxAsyncMarks);
 
   // Keep only the most recent marks within our limit.
@@ -2972,7 +2973,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
   // pending async operations at this checkpoint" and acts as the identity
   // element for max() during merging. We pad at the beginning since the marks
   // need to be aligned in most-recent order.
-  CounterValueArray ZeroMark{};
+  constexpr CounterValueArray ZeroMark{};
   AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
 
   LLVM_DEBUG({
@@ -2981,9 +2982,6 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
       llvm::interleaveComma(Mark, dbgs());
       dbgs() << '\n';
     }
-  });
-
-  LLVM_DEBUG({
     dbgs() << "Other marks:\n";
     for (const auto &Mark : OtherMarks) {
       llvm::interleaveComma(Mark, dbgs());
@@ -2997,8 +2995,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
   unsigned OtherSize = OtherMarks.size();
   unsigned OurSize = AsyncMarks.size();
   unsigned MergeCount = std::min(OtherSize, OurSize);
-  assert(OurSize == MaxSize);
-  for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) {
+  for (auto Idx : seq<unsigned>(1, MergeCount)) {
     for (auto T : inst_counter_types(Context->MaxCounter)) {
       StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
                               OtherMarks[OtherSize - Idx][T]);

From 533eb5fb6963f1700a64ca080acc0d4c21d6fc3b Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 13:19:52 +0530
Subject: [PATCH 2/3] integer range needs to be inclusive

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0541efe2e693b..3fb8dee3a55c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2995,7 +2995,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
   unsigned OtherSize = OtherMarks.size();
   unsigned OurSize = AsyncMarks.size();
   unsigned MergeCount = std::min(OtherSize, OurSize);
-  for (auto Idx : seq<unsigned>(1, MergeCount)) {
+  for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
     for (auto T : inst_counter_types(Context->MaxCounter)) {
       StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
                               OtherMarks[OtherSize - Idx][T]);

From 5d6c7e64d5de2d9bc2ab7af76667d5238db4575d Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Thu, 12 Feb 2026 20:18:30 +0530
Subject: [PATCH 3/3] Update llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Co-authored-by: Jay Foad <jay.foad at amd.com>
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 3fb8dee3a55c1..005d624971ee9 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1003,7 +1003,7 @@ class WaitcntBrackets {
   // State of all counters at each async mark encountered so far.
   SmallVector<CounterValueArray> AsyncMarks;
 
-  // For each backedge in isolation, the algorithm reachs a fixed point after
+  // For each backedge in isolation, the algorithm reaches a fixed point after
   // the first call to merge(). This is unchanged even with the AsyncMarks
   // array because we call mergeScore just like the other cases.
   //