[llvm] [AMDGPU][SIInsertWaitcnts][NFC] Drop `using llvm::AMDGPU` (PR #180782)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 09:13:31 PST 2026
https://github.com/vporpo created https://github.com/llvm/llvm-project/pull/180782
This is a follow-up to PR https://github.com/llvm/llvm-project/pull/178345, which introduced `using namespace llvm::AMDGPU` to keep the patch size small.
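For context, here is a minimal standalone sketch of the pattern the patch applies throughout SIInsertWaitcnts.cpp: the `using namespace` directive is dropped and every formerly-unqualified name is spelled with its namespace at the use site. The namespace and enum below are illustrative stand-ins, not the real llvm::AMDGPU declarations.

    // Standalone illustration only; names are hypothetical stand-ins for
    // llvm::AMDGPU and AMDGPU::InstCounterType.
    #include <cstdio>

    namespace AMDGPU {
    enum InstCounterType { LOAD_CNT, DS_CNT, NUM_INST_CNTS };
    } // namespace AMDGPU

    // Before the change, a file-level `using namespace AMDGPU;` allowed writing
    // bare LOAD_CNT / DS_CNT here. After the change, each reference is
    // explicitly qualified, as in the patch below.
    static const char *counterName(AMDGPU::InstCounterType T) {
      switch (T) {
      case AMDGPU::LOAD_CNT:
        return "loadcnt";
      case AMDGPU::DS_CNT:
        return "dscnt";
      default:
        return "unknown";
      }
    }

    int main() {
      std::printf("%s\n", counterName(AMDGPU::LOAD_CNT));
      return 0;
    }

Fully qualifying the names is somewhat more verbose, but it avoids pulling every AMDGPU enumerator into the file's scope, which is the usual trade-off behind NFC cleanups of this kind.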
>From 800d1048c433eaecc927bb7495f216f4cfe5c66d Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Tue, 10 Feb 2026 17:01:38 +0000
Subject: [PATCH] [AMDGPU][SIInsertWaitcnts][NFC] Drop `using llvm::AMDGPU`
This is a follow-up to PR https://github.com/llvm/llvm-project/pull/178345, which introduced `using namespace llvm::AMDGPU` to keep the patch size small.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 513 ++++++++++----------
1 file changed, 266 insertions(+), 247 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7dfe0da7ef81a..94c0883dbcb29 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,6 @@
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
-using namespace llvm::AMDGPU;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -72,27 +71,27 @@ static cl::opt<bool> ExpertSchedulingModeFlag(
namespace {
// Get the maximum wait count value for a given counter type.
static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
- InstCounterType T) {
+ AMDGPU::InstCounterType T) {
switch (T) {
- case LOAD_CNT:
+ case AMDGPU::LOAD_CNT:
return Limits.LoadcntMax;
- case DS_CNT:
+ case AMDGPU::DS_CNT:
return Limits.DscntMax;
- case EXP_CNT:
+ case AMDGPU::EXP_CNT:
return Limits.ExpcntMax;
- case STORE_CNT:
+ case AMDGPU::STORE_CNT:
return Limits.StorecntMax;
- case SAMPLE_CNT:
+ case AMDGPU::SAMPLE_CNT:
return Limits.SamplecntMax;
- case BVH_CNT:
+ case AMDGPU::BVH_CNT:
return Limits.BvhcntMax;
- case KM_CNT:
+ case AMDGPU::KM_CNT:
return Limits.KmcntMax;
- case X_CNT:
+ case AMDGPU::X_CNT:
return Limits.XcntMax;
- case VA_VDST:
+ case AMDGPU::VA_VDST:
return Limits.VaVdstMax;
- case VM_VSRC:
+ case AMDGPU::VM_VSRC:
return Limits.VmVsrcMax;
default:
return 0;
@@ -229,10 +228,12 @@ enum VmemType {
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
-static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
- AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
- AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
+static const unsigned
+ instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
+ AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
+ AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
+ AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+ AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -240,8 +241,8 @@ static bool updateVMCntOnly(const MachineInstr &Inst) {
}
#ifndef NDEBUG
-static bool isNormalMode(InstCounterType MaxCounter) {
- return MaxCounter == NUM_NORMAL_INST_CNTS;
+ static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
+ return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
@@ -265,11 +266,13 @@ VmemType getVmemType(const MachineInstr &Inst) {
return VMEM_NOSAMPLER;
}
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
Wait.set(T, std::min(Wait.get(T), Count));
}
-void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); }
+void setNoWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) {
+ Wait.set(T, ~0u);
+}
/// A small set of events.
class WaitEventSet {
@@ -354,7 +357,7 @@ class WaitcntGenerator {
const GCNSubtarget &ST;
const SIInstrInfo &TII;
AMDGPU::IsaVersion IV;
- InstCounterType MaxCounter;
+ AMDGPU::InstCounterType MaxCounter;
bool OptNone;
bool ExpandWaitcntProfiling = false;
const AMDGPU::HardwareLimits *Limits = nullptr;
@@ -362,7 +365,8 @@ class WaitcntGenerator {
public:
WaitcntGenerator() = delete;
WaitcntGenerator(const WaitcntGenerator &) = delete;
- WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
+ WaitcntGenerator(const MachineFunction &MF,
+ AMDGPU::InstCounterType MaxCounter,
const AMDGPU::HardwareLimits *Limits)
: ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
@@ -406,11 +410,12 @@ class WaitcntGenerator {
const WaitcntBrackets &ScoreBrackets) = 0;
// Returns the WaitEventSet that corresponds to counter \p T.
- virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;
+ virtual const WaitEventSet &
+ getWaitEvents(AMDGPU::InstCounterType T) const = 0;
/// \returns the counter that corresponds to event \p E.
- InstCounterType getCounterFromEvent(WaitEventType E) const {
- for (auto T : inst_counter_types()) {
+ AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
+ for (auto T : AMDGPU::inst_counter_types()) {
if (getWaitEvents(T).contains(E))
return T;
}
@@ -426,7 +431,7 @@ class WaitcntGenerator {
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
static constexpr const WaitEventSet
- WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
WaitEventSet(
{VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
@@ -452,7 +457,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const WaitEventSet &getWaitEvents(InstCounterType T) const override {
+ const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
return WaitEventMaskForInstPreGFX12[T];
}
@@ -463,7 +468,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
bool IsExpertMode;
static constexpr const WaitEventSet
- WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
@@ -480,7 +485,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
public:
WaitcntGeneratorGFX12Plus() = delete;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter,
+ AMDGPU::InstCounterType MaxCounter,
const AMDGPU::HardwareLimits *Limits,
bool IsExpertMode)
: WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
@@ -495,7 +500,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const WaitEventSet &getWaitEvents(InstCounterType T) const override {
+ const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
return WaitEventMaskForInstGFX12Plus[T];
}
@@ -514,8 +519,8 @@ class SIInsertWaitcnts {
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
- InstCounterType SmemAccessCounter;
- InstCounterType MaxCounter;
+ AMDGPU::InstCounterType SmemAccessCounter;
+ AMDGPU::InstCounterType MaxCounter;
bool IsExpertMode = false;
private:
@@ -532,7 +537,7 @@ class SIInsertWaitcnts {
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
- bool ForceEmitWaitcnt[NUM_INST_CNTS];
+ bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS];
std::unique_ptr<WaitcntGenerator> WCG;
@@ -573,33 +578,33 @@ class SIInsertWaitcnts {
#ifndef NDEBUG
if (DebugCounter::isCounterSet(ForceExpCounter) &&
DebugCounter::shouldExecute(ForceExpCounter)) {
- ForceEmitWaitcnt[EXP_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
} else {
- ForceEmitWaitcnt[EXP_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
}
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
DebugCounter::shouldExecute(ForceLgkmCounter)) {
- ForceEmitWaitcnt[DS_CNT] = true;
- ForceEmitWaitcnt[KM_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
} else {
- ForceEmitWaitcnt[DS_CNT] = false;
- ForceEmitWaitcnt[KM_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
}
if (DebugCounter::isCounterSet(ForceVMCounter) &&
DebugCounter::shouldExecute(ForceVMCounter)) {
- ForceEmitWaitcnt[LOAD_CNT] = true;
- ForceEmitWaitcnt[SAMPLE_CNT] = true;
- ForceEmitWaitcnt[BVH_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
+ ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
} else {
- ForceEmitWaitcnt[LOAD_CNT] = false;
- ForceEmitWaitcnt[SAMPLE_CNT] = false;
- ForceEmitWaitcnt[BVH_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
+ ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
}
- ForceEmitWaitcnt[VA_VDST] = false;
- ForceEmitWaitcnt[VM_VSRC] = false;
+ ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
+ ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
#endif // NDEBUG
}
@@ -662,10 +667,10 @@ class SIInsertWaitcnts {
bool ExpertMode) const;
AtomicRMWState getAtomicRMWState(MachineInstr &MI,
AtomicRMWState PrevState) const;
- const WaitEventSet &getWaitEvents(InstCounterType T) const {
+ const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
return WCG->getWaitEvents(T);
}
- InstCounterType getCounterFromEvent(WaitEventType E) const {
+ AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
return WCG->getCounterFromEvent(E);
}
};
@@ -705,47 +710,47 @@ class WaitcntBrackets {
}
#endif
- bool isSmemCounter(InstCounterType T) const {
- return T == Context->SmemAccessCounter || T == X_CNT;
+ bool isSmemCounter(AMDGPU::InstCounterType T) const {
+ return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
}
- unsigned getSgprScoresIdx(InstCounterType T) const {
+ unsigned getSgprScoresIdx(AMDGPU::InstCounterType T) const {
assert(isSmemCounter(T) && "Invalid SMEM counter");
- return T == X_CNT ? 1 : 0;
+ return T == AMDGPU::X_CNT ? 1 : 0;
}
- unsigned getOutstanding(InstCounterType T) const {
+ unsigned getOutstanding(AMDGPU::InstCounterType T) const {
return ScoreUBs[T] - ScoreLBs[T];
}
- bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
+ bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
return getVMemScore(ID, T) > getScoreLB(T);
}
/// \Return true if we have no score entries for counter \p T.
- bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }
+ bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
private:
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
+ unsigned getScoreLB(AMDGPU::InstCounterType T) const {
+ assert(T < AMDGPU::NUM_INST_CNTS);
return ScoreLBs[T];
}
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
+ unsigned getScoreUB(AMDGPU::InstCounterType T) const {
+ assert(T < AMDGPU::NUM_INST_CNTS);
return ScoreUBs[T];
}
- unsigned getScoreRange(InstCounterType T) const {
+ unsigned getScoreRange(AMDGPU::InstCounterType T) const {
return getScoreUB(T) - getScoreLB(T);
}
- unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
+ unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
auto It = SGPRs.find(RU);
return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
}
- unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
+ unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
auto It = VMem.find(TID);
return It != VMem.end() ? It->second.Scores[T] : 0;
}
@@ -753,67 +758,68 @@ class WaitcntBrackets {
public:
bool merge(const WaitcntBrackets &Other);
- bool counterOutOfOrder(InstCounterType T) const;
+ bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(Wait, Wait);
}
void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
AMDGPU::Waitcnt &UpdateWait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
AMDGPU::Waitcnt &UpdateWait) const;
void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
AMDGPU::Waitcnt &UpdateWait) const;
- void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
+ void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
AMDGPU::Waitcnt &Wait) const;
- void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
+ void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
AMDGPU::Waitcnt &Wait) const;
void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
void updateByEvent(WaitEventType E, MachineInstr &MI);
bool hasPendingEvent() const { return !PendingEvents.empty(); }
bool hasPendingEvent(WaitEventType E) const {
return PendingEvents.contains(E);
}
- bool hasPendingEvent(InstCounterType T) const {
+ bool hasPendingEvent(AMDGPU::InstCounterType T) const {
bool HasPending = PendingEvents & Context->getWaitEvents(T);
assert(HasPending == !empty(T) &&
"Expected pending events iff scoreboard is not empty");
return HasPending;
}
- bool hasMixedPendingEvents(InstCounterType T) const {
+ bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
// Return true if more than one bit is set in Events.
return Events.twoOrMore();
}
bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ return ((LastFlat[AMDGPU::DS_CNT] > ScoreLBs[AMDGPU::DS_CNT] &&
+ LastFlat[AMDGPU::DS_CNT] <= ScoreUBs[AMDGPU::DS_CNT]) ||
+ (LastFlat[AMDGPU::LOAD_CNT] > ScoreLBs[AMDGPU::LOAD_CNT] &&
+ LastFlat[AMDGPU::LOAD_CNT] <= ScoreUBs[AMDGPU::LOAD_CNT]));
}
void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ LastFlat[AMDGPU::LOAD_CNT] = ScoreUBs[AMDGPU::LOAD_CNT];
+ LastFlat[AMDGPU::DS_CNT] = ScoreUBs[AMDGPU::DS_CNT];
}
bool hasPendingGDS() const {
- return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+ return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
+ LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
}
unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS,
- getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
+ return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
+ getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
}
- void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+ void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
// Return true if there might be pending writes to the vgpr-interval by VMEM
// instructions with types different from V.
@@ -837,9 +843,10 @@ class WaitcntBrackets {
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
- getWaitCountMax(Context->getLimits(), STORE_CNT));
- PendingEvents |= Context->getWaitEvents(STORE_CNT);
+ setScoreUB(AMDGPU::STORE_CNT,
+ getScoreUB(AMDGPU::STORE_CNT) +
+ getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
+ PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
}
ArrayRef<const MachineInstr *> getLDSDMAStores() const {
@@ -865,7 +872,7 @@ class WaitcntBrackets {
unsigned OtherShift;
};
- void determineWaitForScore(InstCounterType T, unsigned Score,
+ void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
AMDGPU::Waitcnt &Wait) const;
static bool mergeScore(const MergeInfo &M, unsigned &Score,
@@ -882,24 +889,25 @@ class WaitcntBrackets {
return Context->TRI->regunits(Reg);
}
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
+ void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
+ assert(T < AMDGPU::NUM_INST_CNTS);
ScoreLBs[T] = Val;
}
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
+ void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
+ assert(T < AMDGPU::NUM_INST_CNTS);
ScoreUBs[T] = Val;
- if (T != EXP_CNT)
+ if (T != AMDGPU::EXP_CNT)
return;
- if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
- ScoreLBs[EXP_CNT] =
- ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
+ if (getScoreRange(AMDGPU::EXP_CNT) > getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
+ ScoreLBs[AMDGPU::EXP_CNT] =
+ ScoreUBs[AMDGPU::EXP_CNT] -
+ getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
}
- void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
+ void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
const SIRegisterInfo *TRI = Context->TRI;
if (Reg == AMDGPU::SCC) {
SCCScore = Val;
@@ -915,20 +923,20 @@ class WaitcntBrackets {
}
}
- void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
+ void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
VMem[TID].Scores[T] = Val;
}
- void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
+ void setScoreByOperand(const MachineOperand &Op,
+ AMDGPU::InstCounterType CntTy, unsigned Val);
const SIInsertWaitcnts *Context;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
WaitEventSet PendingEvents;
// Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
+ unsigned LastFlat[AMDGPU::NUM_INST_CNTS] = {0};
// Remember the last GDS operation.
unsigned LastGDS = 0;
@@ -947,7 +955,7 @@ class WaitcntBrackets {
struct VMEMInfo {
// Scores for all instruction counters.
- std::array<unsigned, NUM_INST_CNTS> Scores = {0};
+ std::array<unsigned, AMDGPU::NUM_INST_CNTS> Scores = {0};
// Bitmask of the VmemTypes of VMEM instructions for this VGPR.
unsigned VMEMTypes = 0;
@@ -1001,7 +1009,8 @@ class SIInsertWaitcntsLegacy : public MachineFunctionPass {
} // end anonymous namespace
void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
- InstCounterType CntTy, unsigned Score) {
+ AMDGPU::InstCounterType CntTy,
+ unsigned Score) {
setRegScore(Op.getReg().asMCReg(), CntTy, Score);
}
@@ -1034,7 +1043,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
}
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = Context->getCounterFromEvent(E);
+ AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
assert(T < Context->MaxCounter);
unsigned UB = getScoreUB(T);
@@ -1051,60 +1060,60 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
const MachineRegisterInfo *MRI = Context->MRI;
const SIInstrInfo *TII = Context->TII;
- if (T == EXP_CNT) {
+ if (T == AMDGPU::EXP_CNT) {
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
// All GDS operations must protect their address register (same as
// export.)
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
- setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
+ setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
if (Inst.mayStore()) {
if (const auto *Data0 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
- setScoreByOperand(*Data0, EXP_CNT, CurrScore);
+ setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
if (const auto *Data1 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
- setScoreByOperand(*Data1, EXP_CNT, CurrScore);
+ setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(Op, EXP_CNT, CurrScore);
+ setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
}
}
} else if (TII->isFLAT(Inst)) {
if (Inst.mayStore()) {
setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
- EXP_CNT, CurrScore);
+ AMDGPU::EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
- EXP_CNT, CurrScore);
+ AMDGPU::EXP_CNT, CurrScore);
}
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
- EXP_CNT, CurrScore);
+ AMDGPU::EXP_CNT, CurrScore);
}
} else if (TII->isMTBUF(Inst)) {
if (Inst.mayStore())
- setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
+ setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
- EXP_CNT, CurrScore);
+ AMDGPU::EXP_CNT, CurrScore);
}
} else if (TII->isLDSDIR(Inst)) {
// LDSDIR instructions attach the score to the destination.
setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
- EXP_CNT, CurrScore);
+ AMDGPU::EXP_CNT, CurrScore);
} else {
if (TII->isEXP(Inst)) {
// For export the destination registers are really temps that
@@ -1113,16 +1122,16 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// score.
for (MachineOperand &DefMO : Inst.all_defs()) {
if (TRI->isVGPR(*MRI, DefMO.getReg())) {
- setScoreByOperand(DefMO, EXP_CNT, CurrScore);
+ setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
}
}
}
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(Op, EXP_CNT, CurrScore);
+ setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
}
}
- } else if (T == X_CNT) {
+ } else if (T == AMDGPU::X_CNT) {
WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
if (PendingEvents.contains(OtherEvent)) {
// Hardware inserts an implicit xcnt between interleaved
@@ -1134,12 +1143,12 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(Op, T, CurrScore);
- } else if (T == VA_VDST || T == VM_VSRC) {
+ } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
// Match the score to the VGPR destination or source registers as
// appropriate
for (const MachineOperand &Op : Inst.operands()) {
- if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
- (T == VM_VSRC && Op.isDef()))
+ if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
+ (T == AMDGPU::VM_VSRC && Op.isDef()))
continue;
if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
setScoreByOperand(Op, T, CurrScore);
@@ -1155,7 +1164,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// Special cases where implicit register defs exists, such as M0 or VCC,
// but none with memory instructions.
for (const MachineOperand &Op : Inst.defs()) {
- if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
+ if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT || T == AMDGPU::BVH_CNT) {
if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
continue;
if (updateVMCntOnly(Inst)) {
@@ -1231,41 +1240,41 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
const GCNSubtarget *ST = Context->ST;
OS << '\n';
- for (auto T : inst_counter_types(Context->MaxCounter)) {
+ for (auto T : AMDGPU::inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
- case LOAD_CNT:
+ case AMDGPU::LOAD_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
<< SR << "):";
break;
- case DS_CNT:
+ case AMDGPU::DS_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
<< SR << "):";
break;
- case EXP_CNT:
+ case AMDGPU::EXP_CNT:
OS << " EXP_CNT(" << SR << "):";
break;
- case STORE_CNT:
+ case AMDGPU::STORE_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
<< SR << "):";
break;
- case SAMPLE_CNT:
+ case AMDGPU::SAMPLE_CNT:
OS << " SAMPLE_CNT(" << SR << "):";
break;
- case BVH_CNT:
+ case AMDGPU::BVH_CNT:
OS << " BVH_CNT(" << SR << "):";
break;
- case KM_CNT:
+ case AMDGPU::KM_CNT:
OS << " KM_CNT(" << SR << "):";
break;
- case X_CNT:
+ case AMDGPU::X_CNT:
OS << " X_CNT(" << SR << "):";
break;
- case VA_VDST:
+ case AMDGPU::VA_VDST:
OS << " VA_VDST(" << SR << "): ";
break;
- case VM_VSRC:
+ case AMDGPU::VM_VSRC:
OS << " VM_VSRC(" << SR << "): ";
break;
default:
@@ -1307,7 +1316,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
}
}
- if (T == KM_CNT && SCCScore > 0)
+ if (T == AMDGPU::KM_CNT && SCCScore > 0)
OS << ' ' << SCCScore << ":scc";
}
OS << '\n';
@@ -1333,19 +1342,19 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
/// current WaitcntBrackets and any other waits specified in \p CheckWait.
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
AMDGPU::Waitcnt &UpdateWait) const {
- simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
- simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
- simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
- simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
- simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
- simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
- simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
+ simplifyWaitcnt(AMDGPU::LOAD_CNT, UpdateWait.LoadCnt);
+ simplifyWaitcnt(AMDGPU::EXP_CNT, UpdateWait.ExpCnt);
+ simplifyWaitcnt(AMDGPU::DS_CNT, UpdateWait.DsCnt);
+ simplifyWaitcnt(AMDGPU::STORE_CNT, UpdateWait.StoreCnt);
+ simplifyWaitcnt(AMDGPU::SAMPLE_CNT, UpdateWait.SampleCnt);
+ simplifyWaitcnt(AMDGPU::BVH_CNT, UpdateWait.BvhCnt);
+ simplifyWaitcnt(AMDGPU::KM_CNT, UpdateWait.KmCnt);
simplifyXcnt(CheckWait, UpdateWait);
- simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
+ simplifyWaitcnt(AMDGPU::VA_VDST, UpdateWait.VaVdst);
simplifyVmVsrc(CheckWait, UpdateWait);
}
-void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
unsigned &Count) const {
// The number of outstanding events for this type, T, can be calculated
// as (UB - LB). If the current Count is greater than or equal to the number
@@ -1370,9 +1379,9 @@ void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
+ !hasPendingEvent(AMDGPU::STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
UpdateWait.XCnt = ~0u;
- simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
+ simplifyWaitcnt(AMDGPU::X_CNT, UpdateWait.XCnt);
}
void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
@@ -1384,7 +1393,7 @@ void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
CheckWait.BvhCnt, CheckWait.DsCnt}))
UpdateWait.VmVsrc = ~0u;
- simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
+ simplifyWaitcnt(AMDGPU::VM_VSRC, UpdateWait.VmVsrc);
}
void WaitcntBrackets::purgeEmptyTrackingData() {
@@ -1398,7 +1407,7 @@ void WaitcntBrackets::purgeEmptyTrackingData() {
}
}
-void WaitcntBrackets::determineWaitForScore(InstCounterType T,
+void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const {
const unsigned LB = getScoreLB(T);
@@ -1406,7 +1415,7 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
// If the score falls within the bracket, we need a waitcnt.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
+ if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
!Context->ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
@@ -1427,7 +1436,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
}
}
-void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
+void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
+ MCPhysReg Reg,
AMDGPU::Waitcnt &Wait) const {
if (Reg == AMDGPU::SCC) {
determineWaitForScore(T, SCCScore, Wait);
@@ -1440,7 +1450,8 @@ void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
}
}
-void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
+void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
+ VMEMID TID,
AMDGPU::Waitcnt &Wait) const {
assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
determineWaitForScore(T, getVMemScore(TID, T), Wait);
@@ -1454,9 +1465,9 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
- if ((PendingEvents & Context->getWaitEvents(KM_CNT)) ==
+ if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
SCC_WRITE_PendingEvent) {
- setScoreLB(KM_CNT, getScoreUB(KM_CNT));
+ setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
}
PendingEvents.remove(SCC_WRITE_PendingEvent);
@@ -1465,19 +1476,19 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
- applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- applyWaitcnt(EXP_CNT, Wait.ExpCnt);
- applyWaitcnt(DS_CNT, Wait.DsCnt);
- applyWaitcnt(STORE_CNT, Wait.StoreCnt);
- applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
- applyWaitcnt(BVH_CNT, Wait.BvhCnt);
- applyWaitcnt(KM_CNT, Wait.KmCnt);
- applyWaitcnt(X_CNT, Wait.XCnt);
- applyWaitcnt(VA_VDST, Wait.VaVdst);
- applyWaitcnt(VM_VSRC, Wait.VmVsrc);
+ applyWaitcnt(AMDGPU::LOAD_CNT, Wait.LoadCnt);
+ applyWaitcnt(AMDGPU::EXP_CNT, Wait.ExpCnt);
+ applyWaitcnt(AMDGPU::DS_CNT, Wait.DsCnt);
+ applyWaitcnt(AMDGPU::STORE_CNT, Wait.StoreCnt);
+ applyWaitcnt(AMDGPU::SAMPLE_CNT, Wait.SampleCnt);
+ applyWaitcnt(AMDGPU::BVH_CNT, Wait.BvhCnt);
+ applyWaitcnt(AMDGPU::KM_CNT, Wait.KmCnt);
+ applyWaitcnt(AMDGPU::X_CNT, Wait.XCnt);
+ applyWaitcnt(AMDGPU::VA_VDST, Wait.VaVdst);
+ applyWaitcnt(AMDGPU::VM_VSRC, Wait.VmVsrc);
}
-void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
const unsigned UB = getScoreUB(T);
if (Count >= UB)
return;
@@ -1490,16 +1501,16 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
PendingEvents.remove(Context->getWaitEvents(T));
}
- if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
- if (!hasMixedPendingEvents(X_CNT))
- applyWaitcnt(X_CNT, 0);
+ if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
+ if (!hasMixedPendingEvents(AMDGPU::X_CNT))
+ applyWaitcnt(AMDGPU::X_CNT, 0);
else
PendingEvents.remove(SMEM_GROUP);
}
- if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT)) {
- if (!hasMixedPendingEvents(X_CNT))
- applyWaitcnt(X_CNT, Count);
+ if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(AMDGPU::STORE_CNT)) {
+ if (!hasMixedPendingEvents(AMDGPU::X_CNT))
+ applyWaitcnt(AMDGPU::X_CNT, Count);
else if (Count == 0)
PendingEvents.remove(VMEM_GROUP);
}
@@ -1507,16 +1518,16 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
-bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
// Scalar memory read always can go out of order.
if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
- (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
+ (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
// GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
// so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
// out-of-order completion.
- if (T == LOAD_CNT) {
+ if (T == AMDGPU::LOAD_CNT) {
unsigned Events = hasPendingEvent(T);
// Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
// events
@@ -1560,24 +1571,25 @@ static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
-static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
+static std::optional<AMDGPU::InstCounterType>
+counterTypeForInstr(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::S_WAIT_LOADCNT:
- return LOAD_CNT;
+ return AMDGPU::LOAD_CNT;
case AMDGPU::S_WAIT_EXPCNT:
- return EXP_CNT;
+ return AMDGPU::EXP_CNT;
case AMDGPU::S_WAIT_STORECNT:
- return STORE_CNT;
+ return AMDGPU::STORE_CNT;
case AMDGPU::S_WAIT_SAMPLECNT:
- return SAMPLE_CNT;
+ return AMDGPU::SAMPLE_CNT;
case AMDGPU::S_WAIT_BVHCNT:
- return BVH_CNT;
+ return AMDGPU::BVH_CNT;
case AMDGPU::S_WAIT_DSCNT:
- return DS_CNT;
+ return AMDGPU::DS_CNT;
case AMDGPU::S_WAIT_KMCNT:
- return KM_CNT;
+ return AMDGPU::KM_CNT;
case AMDGPU::S_WAIT_XCNT:
- return X_CNT;
+ return AMDGPU::X_CNT;
default:
return {};
}
@@ -1644,7 +1656,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
assert(ST.hasVMemToLDSLoad());
LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
<< "Before: " << Wait << '\n';);
- ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN, Wait);
LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
// It is possible (but unlikely) that this is the only wait instruction,
@@ -1661,7 +1673,8 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
unsigned OldVSCnt =
TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (TrySimplify)
- ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+ ScoreBrackets.simplifyWaitcnt(AMDGPU::InstCounterType::STORE_CNT,
+ OldVSCnt);
Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
@@ -1677,9 +1690,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
AMDGPU::encodeWaitcnt(IV, Wait));
Modified |= promoteSoftWaitCnt(WaitcntInstr);
- ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::EXP_CNT, Wait.ExpCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.DsCnt);
Wait.LoadCnt = ~0u;
Wait.ExpCnt = ~0u;
Wait.DsCnt = ~0u;
@@ -1697,7 +1710,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
AMDGPU::OpName::simm16, Wait.StoreCnt);
Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
- ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.StoreCnt);
Wait.StoreCnt = ~0u;
LLVM_DEBUG(It.isEnd()
@@ -1742,7 +1755,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// If so, fall back to normal (non-expanded) behavior since expansion
// would provide misleading profiling information.
bool AnyOutOfOrder = false;
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
unsigned WaitCnt = Wait.get(CT);
if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
AnyOutOfOrder = true;
@@ -1757,7 +1770,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
Modified = true;
} else {
// All counters are in-order, safe to expand
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
unsigned WaitCnt = Wait.get(CT);
if (WaitCnt == ~0u)
continue;
@@ -1789,11 +1802,11 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
assert(ST.hasVscnt());
if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u &&
- !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
+ !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
// Only expand if counter is not out-of-order
unsigned Outstanding =
- std::min(ScoreBrackets.getOutstanding(STORE_CNT),
- getWaitCountMax(getLimits(), STORE_CNT) - 1);
+ std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
+ getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -1840,7 +1853,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
MachineInstr *CombinedLoadDsCntInstr = nullptr;
MachineInstr *CombinedStoreDsCntInstr = nullptr;
MachineInstr *WaitcntDepctrInstr = nullptr;
- MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
+ MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
LLVM_DEBUG({
dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
@@ -1908,7 +1921,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
II.eraseFromParent();
continue;
} else {
- std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
+ std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
unsigned OldCnt =
TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
@@ -1967,8 +1980,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
AMDGPU::OpName::simm16, NewEnc);
Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
- ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.DsCnt);
Wait.LoadCnt = ~0u;
Wait.DsCnt = ~0u;
@@ -1991,8 +2004,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
AMDGPU::OpName::simm16, NewEnc);
Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
- ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.StoreCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.DsCnt);
Wait.StoreCnt = ~0u;
Wait.DsCnt = ~0u;
@@ -2024,11 +2037,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// individual wait count instructions for these.
if (Wait.LoadCnt != ~0u) {
- WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
- WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
} else if (Wait.StoreCnt != ~0u) {
- WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
- WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
}
for (MachineInstr **WI : WaitsToErase) {
@@ -2041,7 +2054,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
if (!WaitInstrs[CT])
continue;
@@ -2076,8 +2089,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
- ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
- ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
+ ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.VaVdst);
+ ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.VmVsrc);
Wait.VaVdst = ~0u;
Wait.VmVsrc = ~0u;
@@ -2123,7 +2136,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
// For GFX12+, we use separate wait instructions, which makes expansion
// simpler
if (ExpandWaitcntProfiling) {
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
unsigned Count = Wait.get(CT);
if (Count == ~0u)
continue;
@@ -2181,7 +2194,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
// Generate an instruction for any remaining counter that needs
// waiting for.
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
unsigned Count = Wait.get(CT);
if (Count == ~0u)
continue;
@@ -2281,7 +2294,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// to send a message to explicitly release all VGPRs before the stores have
// completed, but it is only safe to do this if there are no outstanding
// scratch stores.
- EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
+ EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
break;
}
@@ -2316,7 +2329,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
- addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
+ addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
if (MI.isCall()) {
// The function is going to insert a wait on everything in its prolog.
@@ -2378,18 +2391,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if ((I + 1) >= NUM_LDSDMA) {
// We didn't have enough slot to track this LDS DMA store, it
// has been tracked using the common RegNo (FIRST_LDS_VGPR).
- ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
break;
}
- ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
+ TID + I + 1, Wait);
}
}
} else {
- ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
}
if (Memop->isStore()) {
- ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
+ ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
}
}
@@ -2414,9 +2428,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (Op.isImplicit() && MI.mayLoadOrStore())
continue;
- ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait);
if (Op.isDef())
- ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait);
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they are (in some architectures)
@@ -2427,24 +2441,25 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
!ST->hasVmemWriteVgprInOrder()) {
- ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
- ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
- ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg,
+ Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait);
ScoreBrackets.clearVgprVmemTypes(Reg);
}
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
- ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait);
}
- ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait);
} else if (Op.getReg() == AMDGPU::SCC) {
- ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait);
} else {
ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
}
if (ST->hasWaitXcnt() && Op.isDef())
- ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
+ ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait);
}
}
}
@@ -2489,7 +2504,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
- ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::X_CNT, Wait.XCnt);
Wait.XCnt = ~0u;
}
@@ -2499,18 +2514,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
// If we force waitcnt then update Wait accordingly.
- for (InstCounterType T : inst_counter_types()) {
+ for (AMDGPU::InstCounterType T : AMDGPU::inst_counter_types()) {
if (!ForceEmitWaitcnt[T])
continue;
Wait.set(T, 0);
}
if (FlushFlags.FlushVmCnt) {
- for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
+ for (AMDGPU::InstCounterType T :
+ {AMDGPU::LOAD_CNT, AMDGPU::SAMPLE_CNT, AMDGPU::BVH_CNT})
Wait.set(T, 0);
}
- if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
Wait.DsCnt = 0;
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
@@ -2543,7 +2559,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified = true;
}
// Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
- ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ ScoreBrackets.applyWaitcnt(AMDGPU::EXP_CNT, Wait.ExpCnt);
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
@@ -2738,7 +2754,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
} else if (TII->isVINTERP(Inst)) {
int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
- ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
+ ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
} else if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
@@ -2826,10 +2842,10 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
- if (T == DS_CNT)
+ if (T == AMDGPU::DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
- if (T == KM_CNT) {
+ if (T == AMDGPU::KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(SCC_WRITE)) {
if (!OldEvents.contains(SCC_WRITE)) {
@@ -3073,14 +3089,14 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (Block.getFirstTerminator() == Block.end()) {
PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
if (FlushFlags.FlushVmCnt) {
- if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
Wait.LoadCnt = 0;
- if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
Wait.SampleCnt = 0;
- if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
Wait.BvhCnt = 0;
}
- if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
Wait.DsCnt = 0;
}
@@ -3212,14 +3228,14 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// Check if this register has a pending VMEM load from outside the
// loop (value loaded outside and used inside).
VMEMID ID = toVMEMID(RU);
- if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||
- Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||
- Brackets.hasPendingVMEM(ID, BVH_CNT))
+ if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
+ Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
+ Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
UsesVgprLoadedOutsideVMEM = true;
// Check if loaded outside the loop via DS (not VMEM/FLAT).
// Only consider it a DS load if there's no pending VMEM load for
// this register, since FLAT can set both counters.
- else if (Brackets.hasPendingVMEM(ID, DS_CNT))
+ else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
UsesVgprLoadedOutsideDS = true;
}
}
@@ -3323,18 +3339,19 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
: MF.getFunction()
.getFnAttribute("amdgpu-expert-scheduling-mode")
.getValueAsBool());
- MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
+ MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
+ : AMDGPU::NUM_EXTENDED_INST_CNTS;
if (!WCG)
WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
IsExpertMode);
} else {
- MaxCounter = NUM_NORMAL_INST_CNTS;
+ MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
if (!WCG)
- WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
- &Limits);
+ WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
+ MF, AMDGPU::NUM_NORMAL_INST_CNTS, &Limits);
}
- for (auto T : inst_counter_types())
+ for (auto T : AMDGPU::inst_counter_types())
ForceEmitWaitcnt[T] = false;
SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
@@ -3358,12 +3375,14 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (ST->hasExtendedWaitCounts()) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
+ for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
+ if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
+ CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT)
continue;
if (!ST->hasImageInsts() &&
- (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
+ (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
+ CT == AMDGPU::BVH_CNT))
continue;
BuildMI(EntryBB, I, DebugLoc(),