[llvm-branch-commits] [llvm] [NFC][AMDGPU][InsertWaitCnts] Move some simple functions into Utils (PR #202936)

Thu Jun 11 04:15:51 PDT 2026

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/202936

>From 419a3ea395f7eb13ce5564fdd7893d77469cea43 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 12:09:49 +0200
Subject: [PATCH] [NFC][AMDGPU][InsertWaitCnts] Move some simple functions into
 Utils

Move some trivial functions into helpers in AMDGPUWaitcntUtils to declutter SIInsertWaitcnts a bit more.
I had to move HardwareLimits from AMDGPUBaseInfo.h into AMDGPUWaitcntUtils.h, but it's only used in SIInsertWaitcnts so it doesn't matter.
---
 llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp |  75 ++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h   |  32 ++++++
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 107 ++++--------------
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  20 ----
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  20 ----
 5 files changed, 128 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
index df8d22fb5e3dd..75e757d0c8b2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUWaitcntUtils.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 
 namespace llvm::AMDGPU {
@@ -47,6 +48,53 @@ StringLiteral getInstCounterName(InstCounterType T) {
   llvm_unreachable("Unhandled InstCounterType");
 }
 
+HardwareLimits::HardwareLimits(const IsaVersion &IV) {
+  bool HasExtendedWaitCounts = IV.Major >= 12;
+  if (HasExtendedWaitCounts) {
+    LoadcntMax = getLoadcntBitMask(IV);
+    DscntMax = getDscntBitMask(IV);
+  } else {
+    LoadcntMax = getVmcntBitMask(IV);
+    DscntMax = getLgkmcntBitMask(IV);
+  }
+  ExpcntMax = getExpcntBitMask(IV);
+  StorecntMax = getStorecntBitMask(IV);
+  SamplecntMax = getSamplecntBitMask(IV);
+  BvhcntMax = getBvhcntBitMask(IV);
+  KmcntMax = getKmcntBitMask(IV);
+  XcntMax = getXcntBitMask(IV);
+  AsyncMax = getAsynccntBitMask(IV);
+  VaVdstMax = DepCtr::getVaVdstBitMask();
+  VmVsrcMax = DepCtr::getVmVsrcBitMask();
+}
+
+unsigned HardwareLimits::get(InstCounterType T) const {
+  switch (T) {
+  case AMDGPU::LOAD_CNT:
+    return LoadcntMax;
+  case AMDGPU::DS_CNT:
+    return DscntMax;
+  case AMDGPU::EXP_CNT:
+    return ExpcntMax;
+  case AMDGPU::STORE_CNT:
+    return StorecntMax;
+  case AMDGPU::SAMPLE_CNT:
+    return SamplecntMax;
+  case AMDGPU::BVH_CNT:
+    return BvhcntMax;
+  case AMDGPU::KM_CNT:
+    return KmcntMax;
+  case AMDGPU::X_CNT:
+    return XcntMax;
+  case AMDGPU::VA_VDST:
+    return VaVdstMax;
+  case AMDGPU::VM_VSRC:
+    return VmVsrcMax;
+  default:
+    return 0;
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void Waitcnt::dump() const { dbgs() << *this << '\n'; }
 #endif
@@ -89,4 +137,31 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version,
                              Decoded.get(DS_CNT));
 }
 
+std::optional<AMDGPU::InstCounterType> counterTypeForInstr(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_WAIT_LOADCNT:
+    return AMDGPU::LOAD_CNT;
+  case AMDGPU::S_WAIT_EXPCNT:
+    return AMDGPU::EXP_CNT;
+  case AMDGPU::S_WAIT_STORECNT:
+    return AMDGPU::STORE_CNT;
+  case AMDGPU::S_WAIT_SAMPLECNT:
+    return AMDGPU::SAMPLE_CNT;
+  case AMDGPU::S_WAIT_BVHCNT:
+    return AMDGPU::BVH_CNT;
+  case AMDGPU::S_WAIT_DSCNT:
+    return AMDGPU::DS_CNT;
+  case AMDGPU::S_WAIT_KMCNT:
+    return AMDGPU::KM_CNT;
+  case AMDGPU::S_WAIT_XCNT:
+    return AMDGPU::X_CNT;
+  case AMDGPU::S_WAIT_ASYNCCNT:
+    return AMDGPU::ASYNC_CNT;
+  case AMDGPU::S_WAIT_TENSORCNT:
+    return AMDGPU::TENSOR_CNT;
+  default:
+    return {};
+  }
+}
+
 } // namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
index 093d8a45d207b..0930a95288087 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
@@ -46,6 +46,28 @@ StringLiteral getInstCounterName(InstCounterType T);
 iota_range<InstCounterType>
 inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS);
 
+/// Represents the hardware counter limits for different wait count types.
+struct HardwareLimits {
+  unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
+  unsigned ExpcntMax;
+  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
+  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
+  unsigned SamplecntMax; // gfx12+ only.
+  unsigned BvhcntMax;    // gfx12+ only.
+  unsigned KmcntMax;     // gfx12+ only.
+  unsigned XcntMax;      // gfx1250.
+  unsigned AsyncMax;     // gfx1250.
+  unsigned VaVdstMax;    // gfx12+ expert mode only.
+  unsigned VmVsrcMax;    // gfx12+ expert mode only.
+
+  HardwareLimits() = default;
+
+  /// Initializes hardware limits from ISA version.
+  HardwareLimits(const IsaVersion &IV);
+
+  unsigned get(InstCounterType T) const;
+};
+
 } // namespace AMDGPU
 
 template <> struct enum_iteration_traits<AMDGPU::InstCounterType> {
@@ -109,6 +131,12 @@ class Waitcnt {
     return false;
   }
 
+  void add(AMDGPU::InstCounterType T, unsigned Count) {
+    set(T, std::min(get(T), Count));
+  }
+
+  void clear(AMDGPU::InstCounterType T) { set(T, ~0u); }
+
   bool hasWaitStoreCnt() const { return Cnt[STORE_CNT] != ~0u; }
 
   bool hasWaitDepctr() const {
@@ -168,6 +196,10 @@ unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
 /// \p Version.
 unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
 
+/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
+/// and if so, which counter it is waiting on.
+std::optional<AMDGPU::InstCounterType> counterTypeForInstr(unsigned Opcode);
+
 } // namespace AMDGPU
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1bb6950d4f574..f13d401f70fa2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -74,34 +74,6 @@ static cl::opt<bool> ExpertSchedulingModeFlag(
     cl::init(false), cl::Hidden);
 
 namespace {
-// Get the maximum wait count value for a given counter type.
-static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
-                                AMDGPU::InstCounterType T) {
-  switch (T) {
-  case AMDGPU::LOAD_CNT:
-    return Limits.LoadcntMax;
-  case AMDGPU::DS_CNT:
-    return Limits.DscntMax;
-  case AMDGPU::EXP_CNT:
-    return Limits.ExpcntMax;
-  case AMDGPU::STORE_CNT:
-    return Limits.StorecntMax;
-  case AMDGPU::SAMPLE_CNT:
-    return Limits.SamplecntMax;
-  case AMDGPU::BVH_CNT:
-    return Limits.BvhcntMax;
-  case AMDGPU::KM_CNT:
-    return Limits.KmcntMax;
-  case AMDGPU::X_CNT:
-    return Limits.XcntMax;
-  case AMDGPU::VA_VDST:
-    return Limits.VaVdstMax;
-  case AMDGPU::VM_VSRC:
-    return Limits.VmVsrcMax;
-  default:
-    return 0;
-  }
-}
 
 /// Integer IDs used to track vector memory locations we may have to wait on.
 /// Encoded as u16 chunks:
@@ -211,13 +183,6 @@ VmemType getVmemType(const MachineInstr &Inst) {
   return VMEM_NOSAMPLER;
 }
 
-void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
-  Wait.set(T, std::min(Wait.get(T), Count));
-}
-
-void setNoWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) {
-  Wait.set(T, ~0u);
-}
 class WaitcntBrackets;
 
 // This abstracts the logic for generating and updating S_WAIT* instructions
@@ -701,7 +666,7 @@ class WaitcntBrackets {
 
   unsigned getPendingGDSWait() const {
     return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
-                    getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
+                    Context->getLimits().get(AMDGPU::DS_CNT) - 1);
   }
 
   void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
@@ -730,7 +695,7 @@ class WaitcntBrackets {
   void setStateOnFunctionEntryOrReturn() {
     setScoreUB(AMDGPU::STORE_CNT,
                getScoreUB(AMDGPU::STORE_CNT) +
-                   getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
+                   Context->getLimits().get(AMDGPU::STORE_CNT));
     PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
   }
 
@@ -787,10 +752,9 @@ class WaitcntBrackets {
       return;
 
     if (getScoreRange(AMDGPU::EXP_CNT) >
-        getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
+        Context->getLimits().get(AMDGPU::EXP_CNT))
       ScoreLBs[AMDGPU::EXP_CNT] =
-          ScoreUBs[AMDGPU::EXP_CNT] -
-          getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
+          ScoreUBs[AMDGPU::EXP_CNT] - Context->getLimits().get(AMDGPU::EXP_CNT);
   }
 
   void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
@@ -1423,18 +1387,18 @@ void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
       // If there is a pending FLAT operation, and this is a VMem or LGKM
       // waitcnt and the target can report early completion, then we need
       // to force a waitcnt 0.
-      addWait(Wait, T, 0);
+      Wait.add(T, 0);
     } else if (counterOutOfOrder(T)) {
       // Counter can get decremented out-of-order when there
       // are multiple types event in the bracket. Also emit an s_wait counter
       // with a conservative value of 0 for the counter.
-      addWait(Wait, T, 0);
+      Wait.add(T, 0);
     } else {
       // If a counter has been maxed out avoid overflow by waiting for
       // MAX(CounterType) - 1 instead.
-      unsigned NeededWait = std::min(
-          UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
-      addWait(Wait, T, NeededWait);
+      unsigned NeededWait =
+          std::min(UB - ScoreToWait, Context->getLimits().get(T) - 1);
+      Wait.add(T, NeededWait);
     }
   }
 }
@@ -1662,36 +1626,6 @@ static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
   return true;
 }
 
-/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
-/// and if so, which counter it is waiting on.
-static std::optional<AMDGPU::InstCounterType>
-counterTypeForInstr(unsigned Opcode) {
-  switch (Opcode) {
-  case AMDGPU::S_WAIT_LOADCNT:
-    return AMDGPU::LOAD_CNT;
-  case AMDGPU::S_WAIT_EXPCNT:
-    return AMDGPU::EXP_CNT;
-  case AMDGPU::S_WAIT_STORECNT:
-    return AMDGPU::STORE_CNT;
-  case AMDGPU::S_WAIT_SAMPLECNT:
-    return AMDGPU::SAMPLE_CNT;
-  case AMDGPU::S_WAIT_BVHCNT:
-    return AMDGPU::BVH_CNT;
-  case AMDGPU::S_WAIT_DSCNT:
-    return AMDGPU::DS_CNT;
-  case AMDGPU::S_WAIT_KMCNT:
-    return AMDGPU::KM_CNT;
-  case AMDGPU::S_WAIT_XCNT:
-    return AMDGPU::X_CNT;
-  case AMDGPU::S_WAIT_ASYNCCNT:
-    return AMDGPU::ASYNC_CNT;
-  case AMDGPU::S_WAIT_TENSORCNT:
-    return AMDGPU::TENSOR_CNT;
-  default:
-    return {};
-  }
-}
-
 bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
   unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
   if (Opcode == Waitcnt->getOpcode())
@@ -1880,7 +1814,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
             continue;
 
           unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
-                                          getWaitCountMax(getLimits(), CT) - 1);
+                                          getLimits().get(CT) - 1);
           EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
             AMDGPU::Waitcnt W;
             W.set(CT, Count);
@@ -1910,7 +1844,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
       // Only expand if counter is not out-of-order
       unsigned Outstanding =
           std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
-                   getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
+                   getLimits().get(AMDGPU::STORE_CNT) - 1);
       EmitExpandedWaitcnt(
           Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
             BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
@@ -2064,14 +1998,15 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
       Wait = Wait.combined(OldWait);
     } else {
-      std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
+      std::optional<AMDGPU::InstCounterType> CT =
+          AMDGPU::counterTypeForInstr(Opcode);
       assert(CT.has_value());
       unsigned OldCnt =
           TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       if (TrySimplify)
-        addWait(Wait, CT.value(), OldCnt);
+        Wait.add(CT.value(), OldCnt);
       else
-        addWait(RequiredWait, CT.value(), OldCnt);
+        RequiredWait.add(CT.value(), OldCnt);
       // Keep the first wait of its kind, erase the rest.
       if (WaitInstrs[CT.value()] == nullptr) {
         WaitInstrs[CT.value()] = &II;
@@ -2188,7 +2123,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
 
       ScoreBrackets.applyWaitcnt(CT, NewCnt);
-      setNoWait(Wait, CT);
+      Wait.clear(CT);
 
       LLVM_DEBUG(It.isEnd()
                      ? dbgs() << "applied pre-existing waitcnt\n"
@@ -2272,8 +2207,8 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
         continue;
       }
 
-      unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
-                                      getWaitCountMax(getLimits(), CT) - 1);
+      unsigned Outstanding =
+          std::min(ScoreBrackets.getOutstanding(CT), getLimits().get(CT) - 1);
       EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
         BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
             .addImm(Val);
@@ -2455,7 +2390,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     // Wait for any pending GDS instruction to complete before any
     // "Always GDS" instruction.
     if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
-      addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
+      Wait.add(AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
 
     if (MI.isCall()) {
       // The function is going to insert a wait on everything in its prolog.
@@ -2496,7 +2431,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
         const Value *Ptr = Memop->getValue();
         if (Memop->isStore()) {
           if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
-            addWait(Wait, SmemAccessCounter, 0);
+            Wait.add(SmemAccessCounter, 0);
             if (PDT.dominates(MI.getParent(), It->second))
               SLoadAddresses.erase(It);
           }
@@ -2993,7 +2928,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
          Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
          Opcode == AMDGPU::S_WAITCNT_lds_direct ||
          Opcode == AMDGPU::WAIT_ASYNCMARK ||
-         counterTypeForInstr(Opcode).has_value();
+         AMDGPU::counterTypeForInstr(Opcode).has_value();
 }
 
 void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 33df51e8a7e07..3cec436b61701 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1835,26 +1835,6 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
   return (1 << getStorecntBitWidth(Version.Major)) - 1;
 }
 
-HardwareLimits::HardwareLimits(const IsaVersion &IV) {
-  bool HasExtendedWaitCounts = IV.Major >= 12;
-  if (HasExtendedWaitCounts) {
-    LoadcntMax = getLoadcntBitMask(IV);
-    DscntMax = getDscntBitMask(IV);
-  } else {
-    LoadcntMax = getVmcntBitMask(IV);
-    DscntMax = getLgkmcntBitMask(IV);
-  }
-  ExpcntMax = getExpcntBitMask(IV);
-  StorecntMax = getStorecntBitMask(IV);
-  SamplecntMax = getSamplecntBitMask(IV);
-  BvhcntMax = getBvhcntBitMask(IV);
-  KmcntMax = getKmcntBitMask(IV);
-  XcntMax = getXcntBitMask(IV);
-  AsyncMax = getAsynccntBitMask(IV);
-  VaVdstMax = DepCtr::getVaVdstBitMask();
-  VmVsrcMax = DepCtr::getVmVsrcBitMask();
-}
-
 unsigned getWaitcntBitMask(const IsaVersion &Version) {
   unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
                                 getVmcntBitWidthLo(Version.Major));
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1f7084c8d25ae..e669add7ea980 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1101,26 +1101,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
 /// Checks if \p Val is inside \p MD, a !range-like metadata.
 bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
 
-/// Represents the hardware counter limits for different wait count types.
-struct HardwareLimits {
-  unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
-  unsigned ExpcntMax;
-  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
-  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
-  unsigned SamplecntMax; // gfx12+ only.
-  unsigned BvhcntMax;    // gfx12+ only.
-  unsigned KmcntMax;     // gfx12+ only.
-  unsigned XcntMax;      // gfx1250.
-  unsigned AsyncMax;     // gfx1250.
-  unsigned VaVdstMax;    // gfx12+ expert mode only.
-  unsigned VmVsrcMax;    // gfx12+ expert mode only.
-
-  HardwareLimits() = default;
-
-  /// Initializes hardware limits from ISA version.
-  HardwareLimits(const IsaVersion &IV);
-};
-
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.