[clang] [llvm] Revert "[AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling" (PR #175904)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jan 14 00:47:28 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Pankaj Dwivedi (PankajDwivedi-25)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#<!-- -->169345 to address review feedback.
---
Patch is 68.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175904.diff
7 Files Affected:
- (modified) clang/include/clang/Basic/CodeGenOptions.def (-4)
- (modified) clang/include/clang/Options/Options.td (-7)
- (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (-2)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+92-203)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (-19)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (-20)
- (removed) llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll (-944)
``````````diff
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index baf8b093c10e6..6cdbffc456193 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -466,10 +466,6 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign)
/// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
-/// Enable expanded waitcnt for profiling (AMDGPU Only)
-/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls.
-CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
-
// Whether to emit Swift Async function extended frame information: auto,
// never, always.
ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 2f57a5b13b917..5ad0ff2a773c8 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5585,13 +5585,6 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee",
"This option changes the ABI. (AMDGPU only)">,
NegFlag<SetFalse, [], [ClangOption, CC1Option]>>;
-defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling",
- CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse,
- PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help "
- "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), "
- "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
- NegFlag<SetFalse, [], [ClangOption]>>;
-
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4bc9557b26b52..0ab6c753b8bad 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -443,8 +443,6 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
setFunctionDeclAttributes(FD, F, M);
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
- if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
- F->addFnAttr("amdgpu-expand-waitcnt-profiling");
}
unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b3e834b66ad45..bf842e0ecb4af 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -105,35 +105,6 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
return enum_seq(LOAD_CNT, MaxCounter);
}
-// Get the maximum wait count value for a given counter type.
-static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
- InstCounterType T) {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- case VA_VDST:
- return Limits.VaVdstMax;
- case VM_VSRC:
- return Limits.VmVsrcMax;
- default:
- return 0;
- }
-}
-
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
@@ -169,6 +140,19 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
return static_cast<unsigned>(RU);
}
+struct HardwareLimits {
+ unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
+ unsigned ExpcntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
+ unsigned VaVdstMax; // gfx12+ expert mode only.
+ unsigned VmVsrcMax; // gfx12+ expert mode only.
+};
+
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
@@ -330,27 +314,19 @@ class WaitcntGenerator {
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
bool OptNone;
- bool ExpandWaitcntProfiling = false;
- const AMDGPU::HardwareLimits *Limits = nullptr;
public:
WaitcntGenerator() = default;
- WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits)
+ WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
- ExpandWaitcntProfiling(
- MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
- Limits(Limits) {}
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
// Return true if the current function should be compiled with no
// optimization.
bool isOptNone() const { return OptNone; }
- const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
-
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
@@ -372,11 +348,9 @@ class WaitcntGenerator {
// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
- // If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) = 0;
+ AMDGPU::Waitcnt Wait) = 0;
// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
@@ -401,10 +375,7 @@ class WaitcntGenerator {
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
- WaitcntGeneratorPreGFX12() = default;
- WaitcntGeneratorPreGFX12(const MachineFunction &MF,
- const AMDGPU::HardwareLimits *Limits)
- : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
+ using WaitcntGenerator::WaitcntGenerator;
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -413,8 +384,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) override;
+ AMDGPU::Waitcnt Wait) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -446,10 +416,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
WaitcntGeneratorGFX12Plus() = default;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits,
- bool IsExpertMode)
- : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
+ InstCounterType MaxCounter, bool IsExpertMode)
+ : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -458,8 +426,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) override;
+ AMDGPU::Waitcnt Wait) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -533,7 +500,7 @@ class SIInsertWaitcnts {
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- AMDGPU::HardwareLimits Limits;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -544,7 +511,33 @@ class SIInsertWaitcnts {
(void)ForceVMCounter;
}
- const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ case VA_VDST:
+ return Limits.VaVdstMax;
+ case VM_VSRC:
+ return Limits.VmVsrcMax;
+ default:
+ break;
+ }
+ return 0;
+ }
PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
const WaitcntBrackets &Brackets);
@@ -776,7 +769,7 @@ class WaitcntBrackets {
unsigned getPendingGDSWait() const {
return std::min(getScoreUB(DS_CNT) - LastGDS,
- getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
+ Context->getWaitCountMax(DS_CNT) - 1);
}
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -803,8 +796,8 @@ class WaitcntBrackets {
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
- getWaitCountMax(Context->getLimits(), STORE_CNT));
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
}
@@ -860,9 +853,8 @@ class WaitcntBrackets {
if (T != EXP_CNT)
return;
- if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
- ScoreLBs[EXP_CNT] =
- ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
+ if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
}
void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
@@ -1365,8 +1357,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- unsigned NeededWait = std::min(
- UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
+ unsigned NeededWait =
+ std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1683,109 +1675,38 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
+ AMDGPU::Waitcnt Wait) {
assert(ST);
assert(isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
- // Helper to emit expanded waitcnt sequence for profiling.
- // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
- // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
- auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
- auto EmitWaitcnt) {
- if (Outstanding > Target) {
- for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
- EmitWaitcnt(i);
- Modified = true;
- }
- } else {
- EmitWaitcnt(Target);
- Modified = true;
- }
- };
-
// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
- // If profiling expansion is enabled and we have score brackets,
- // emit an expanded sequence
- if (ExpandWaitcntProfiling && ScoreBrackets) {
- // Check if any of the counters to be waited on are out-of-order.
- // If so, fall back to normal (non-expanded) behavior since expansion
- // would provide misleading profiling information.
- bool AnyOutOfOrder = false;
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
- unsigned &WaitCnt = getCounterRef(Wait, CT);
- if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
- AnyOutOfOrder = true;
- break;
- }
- }
-
- if (AnyOutOfOrder) {
- // Fall back to non-expanded wait
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
- } else {
- // All counters are in-order, safe to expand
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
- unsigned &WaitCnt = getCounterRef(Wait, CT);
- if (WaitCnt == ~0u)
- continue;
-
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
- ScoreBrackets->getScoreLB(CT),
- getWaitCountMax(getLimits(), CT) - 1);
- EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
- AMDGPU::Waitcnt W;
- getCounterRef(W, CT) = Count;
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(IV, W));
- });
- }
- }
- } else {
- // Normal behavior: emit single combined waitcnt
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
+ Modified = true;
- LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());
- if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
- !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
- // Only expand if counter is not out-of-order
- unsigned Outstanding =
- std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
- ScoreBrackets->getScoreLB(STORE_CNT),
- getWaitCountMax(getLimits(), STORE_CNT) - 1);
- EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
+ [[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Count);
- });
- } else {
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.StoreCnt);
- Modified = true;
+ .addImm(Wait.StoreCnt);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
return Modified;
@@ -2082,55 +2003,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
+ AMDGPU::Waitcnt Wait) {
assert(ST);
assert(!isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
- // Helper to emit expanded waitcnt sequence for profiling.
- auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
- auto EmitWaitcnt) {
- if (Outstanding > Target) {
- for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
- EmitWaitcnt(i);
- Modified = true;
- }
- } else {
- EmitWaitcnt(Target);
- Modified = true;
- }
- };
-
- // For GFX12+, we use separate wait instructions, which makes expansion
- // simpler
- if (ExpandWaitcntProfiling && ScoreBrackets) {
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- unsigned Count = getWait(Wait, CT);
- if (Count == ~0u)
- continue;
-
- // Skip expansion for out-of-order counters - emit normal wait instead
- if (ScoreBrackets->counterOutOfOrder(CT)) {
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(Count);
- Modified = true;
- continue;
- }
-
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
- ScoreBrackets->getScoreLB(CT),
- getWaitCountMax(getLimits(), CT) - 1);
- EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(Val);
- });
- }
- return Modified;
- }
-
- // Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
@@ -2529,7 +2408,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- AMDGPU::Waitcnt WaitForScore = Wait;
+ // Any counts that could have been applied to any existing waitcnt
+ // instructions will have been done so, now deal with any remaining.
+ ScoreBrackets.applyWaitcnt(Wait);
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -2546,13 +2427,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
+ if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(WaitForScore);
-
return Modified;
}
@@ -3259,9 +3136,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
- // Initialize hardware limits first, as they're needed by the generators.
- Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
-
if (ST->hasExtendedWaitCounts()) {
IsExpertMode = ST->hasExpertSchedulingMode() &&
(ExpertSchedulingModeFlag.getNumOccurrences()
@@ -3270,12 +3144,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
.getFnAttrib...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/175904
More information about the cfe-commits
mailing list