[llvm] [AMDGPU] Factor out common code from SIInsertWaitcnts (PR #83018)

Christudasan Devadasan via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 26 07:44:07 PST 2024


https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/83018

The SIInsertWaitcnts pass inserts the various wait counts required for the operands of memory operations. For a subtarget, a new waitcnt insertion should be attempted after the Hazard Recognizer, which runs later in the pipeline than where SIInsertWaitcnts is currently placed.

Factor out the common code into Utils/AMDGPUWaitCountUtils so that most of it can be reused by the new waitcnt insertion pass as well. A sketch of the intended reuse follows.
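As a minimal sketch (not part of this patch) of how a later pass could build on the factored-out interface, assuming the AMDGPUWaitCntInserter, WaitCntGenerator, WaitcntBrackets and VGPRInstsSet types introduced in the diff below; the subclass name here is hypothetical:

  #include "Utils/AMDGPUWaitCountUtils.h"
  using namespace llvm;
  using namespace llvm::AMDGPU;

  // Hypothetical post-hazard-recognizer inserter. It mirrors the
  // SIWaitCntsInserter declaration added by this patch and overrides
  // the same policy hooks declared by AMDGPUWaitCntInserter.
  class LateWaitCntsInserter : public AMDGPUWaitCntInserter {
  public:
    LateWaitCntsInserter(const GCNSubtarget *ST,
                         const MachineRegisterInfo *MRI,
                         WaitCntGenerator *WCG, InstCounterType MC)
        : AMDGPUWaitCntInserter(ST, MRI, WCG, MC) {}

    bool generateWaitcntInstBefore(MachineInstr &MI,
                                   WaitcntBrackets &ScoreBrackets,
                                   MachineInstr *OldWaitcntInstr,
                                   bool FlushVmCnt,
                                   VGPRInstsSet *VGPRInsts) override;
    bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                              WaitcntBrackets &ScoreBrackets,
                              VGPRInstsSet *VGPRInsts = nullptr) override;
    void updateEventWaitcntAfter(MachineInstr &Inst,
                                 WaitcntBrackets *ScoreBrackets) override;
  };

The WaitcntBrackets scoreboard and the waitcnt generators move into the utility unchanged, so a new pass only has to supply its own per-pass policy in these hooks.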

From bd4843eb882bfb680b946a1a80d7afa41f6b9711 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 26 Feb 2024 20:11:56 +0530
Subject: [PATCH] [AMDGPU] Factor out common code from SIInsertWaitcnts

The SIInsertWaitcnts pass inserts the various wait counts required
for the operands of memory operations. For a subtarget, a new
waitcnt insertion should be attempted after the Hazard Recognizer,
which runs later in the pipeline than where SIInsertWaitcnts is
currently placed.

Factor out the common code into Utils/AMDGPUWaitCountUtils so
that most of it can be reused by the new waitcnt insertion pass
as well.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 2503 +++--------------
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |    3 +-
 .../AMDGPU/Utils/AMDGPUWaitCountUtils.cpp     | 1393 +++++++++
 .../AMDGPU/Utils/AMDGPUWaitCountUtils.h       |  531 ++++
 llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt   |    1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   10 +-
 6 files changed, 2289 insertions(+), 2152 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a6184c5e1e0487..19d5ae17d3ec17 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -28,6 +28,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUWaitCountUtils.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
@@ -38,6 +39,7 @@
 #include "llvm/Support/DebugCounter.h"
 #include "llvm/TargetParser/TargetParser.h"
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 #define DEBUG_TYPE "si-insert-waitcnts"
 
@@ -53,1540 +55,229 @@ static cl::opt<bool> ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
-namespace {
-// Class of object that encapsulates latest instruction counter score
-// associated with the operand.  Used for determining whether
-// s_waitcnt instruction needs to be emitted.
-
-enum InstCounterType {
-  LOAD_CNT = 0, // VMcnt prior to gfx12.
-  DS_CNT,       // LGKMcnt prior to gfx12.
-  EXP_CNT,      //
-  STORE_CNT,    // VScnt in gfx10/gfx11.
-  NUM_NORMAL_INST_CNTS,
-  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
-  BVH_CNT,                           // gfx12+ only.
-  KM_CNT,                            // gfx12+ only.
-  NUM_EXTENDED_INST_CNTS,
-  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
-};
-} // namespace
-
-namespace llvm {
-template <> struct enum_iteration_traits<InstCounterType> {
-  static constexpr bool is_iterable = true;
-};
-} // namespace llvm
-
-namespace {
-// Return an iterator over all counters between LOAD_CNT (the first counter)
-// and \c MaxCounter (exclusive, default value yields an enumeration over
-// all counters).
-auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
-  return enum_seq(LOAD_CNT, MaxCounter);
-}
-
-using RegInterval = std::pair<int, int>;
-
-struct HardwareLimits {
-  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
-  unsigned ExpcntMax;
-  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
-  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
-  unsigned SamplecntMax; // gfx12+ only.
-  unsigned BvhcntMax;    // gfx12+ only.
-  unsigned KmcntMax;     // gfx12+ only.
-};
-
-struct RegisterEncoding {
-  unsigned VGPR0;
-  unsigned VGPRL;
-  unsigned SGPR0;
-  unsigned SGPRL;
-};
-
-enum WaitEventType {
-  VMEM_ACCESS,              // vector-memory read & write
-  VMEM_READ_ACCESS,         // vector-memory read
-  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
-  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
-  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
-  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
-  LDS_ACCESS,               // lds read & write
-  GDS_ACCESS,               // gds read & write
-  SQ_MESSAGE,               // send message
-  SMEM_ACCESS,              // scalar-memory read & write
-  EXP_GPR_LOCK,             // export holding on its data src
-  GDS_GPR_LOCK,             // GDS holding on its data and addr src
-  EXP_POS_ACCESS,           // write to export position
-  EXP_PARAM_ACCESS,         // write to export parameter
-  VMW_GPR_LOCK,             // vector-memory write holding on its data src
-  EXP_LDS_ACCESS,           // read by ldsdir counting as export
-  NUM_WAIT_EVENTS,
-};
-
-// The mapping is:
-//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
-//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
-//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
-// We reserve a fixed number of VGPR slots in the scoring tables for
-// special tokens like SCMEM_LDS (needed for buffer load to LDS).
-enum RegisterMapping {
-  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
-  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
-  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
-  // Artificial register slots to track LDS writes into specific LDS locations
-  // if a location is known. When slots are exhausted or location is
-  // unknown use the first slot. The first slot is also always updated in
-  // addition to known location's slot to properly generate waits if dependent
-  // instruction's location is unknown.
-  EXTRA_VGPR_LDS = 0,
-  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
-};
-
-// Enumerate different types of result-returning VMEM operations. Although
-// s_waitcnt orders them all with a single vmcnt counter, in the absence of
-// s_waitcnt only instructions of the same VmemType are guaranteed to write
-// their results in order -- so there is no need to insert an s_waitcnt between
-// two instructions of the same type that write the same vgpr.
-enum VmemType {
-  // BUF instructions and MIMG instructions without a sampler.
-  VMEM_NOSAMPLER,
-  // MIMG instructions with a sampler.
-  VMEM_SAMPLER,
-  // BVH instructions
-  VMEM_BVH,
-  NUM_VMEM_TYPES
-};
-
-// Maps values of InstCounterType to the instruction that waits on that
-// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
-// returns true.
-static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
-    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
-    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
-    AMDGPU::S_WAIT_KMCNT};
-
-static bool updateVMCntOnly(const MachineInstr &Inst) {
-  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
-         SIInstrInfo::isFLATScratch(Inst);
-}
-
-#ifndef NDEBUG
-static bool isNormalMode(InstCounterType MaxCounter) {
-  return MaxCounter == NUM_NORMAL_INST_CNTS;
-}
-#endif // NDEBUG
-
-VmemType getVmemType(const MachineInstr &Inst) {
-  assert(updateVMCntOnly(Inst));
-  if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
-      !SIInstrInfo::isVSAMPLE(Inst))
-    return VMEM_NOSAMPLER;
-  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
-  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
-      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
-  return BaseInfo->BVH ? VMEM_BVH
-                       : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
-}
-
-unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
-  switch (T) {
-  case LOAD_CNT:
-    return Wait.LoadCnt;
-  case EXP_CNT:
-    return Wait.ExpCnt;
-  case DS_CNT:
-    return Wait.DsCnt;
-  case STORE_CNT:
-    return Wait.StoreCnt;
-  case SAMPLE_CNT:
-    return Wait.SampleCnt;
-  case BVH_CNT:
-    return Wait.BvhCnt;
-  case KM_CNT:
-    return Wait.KmCnt;
-  default:
-    llvm_unreachable("bad InstCounterType");
-  }
-}
-
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
-  unsigned &WC = getCounterRef(Wait, T);
-  WC = std::min(WC, Count);
-}
-
-void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
-  getCounterRef(Wait, T) = ~0u;
-}
-
-unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
-  return getCounterRef(Wait, T);
-}
-
-// Mapping from event to counter according to the table masks.
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
-  for (auto T : inst_counter_types()) {
-    if (masks[T] & (1 << E))
-      return T;
-  }
-  llvm_unreachable("event type has no associated counter");
-}
+//===----------------------------------------------------------------------===//
+// SIWaitCntsInserter helper class interface.
+//===----------------------------------------------------------------------===//
 
-// This object maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
+class SIWaitCntsInserter : public AMDGPUWaitCntInserter {
 public:
-  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
-                  HardwareLimits Limits, RegisterEncoding Encoding,
-                  const unsigned *WaitEventMaskForInst,
-                  InstCounterType SmemAccessCounter)
-      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
-        Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
-        SmemAccessCounter(SmemAccessCounter) {}
-
-  unsigned getWaitCountMax(InstCounterType T) const {
-    switch (T) {
-    case LOAD_CNT:
-      return Limits.LoadcntMax;
-    case DS_CNT:
-      return Limits.DscntMax;
-    case EXP_CNT:
-      return Limits.ExpcntMax;
-    case STORE_CNT:
-      return Limits.StorecntMax;
-    case SAMPLE_CNT:
-      return Limits.SamplecntMax;
-    case BVH_CNT:
-      return Limits.BvhcntMax;
-    case KM_CNT:
-      return Limits.KmcntMax;
-    default:
-      break;
-    }
-    return 0;
-  }
-
-  unsigned getScoreLB(InstCounterType T) const {
-    assert(T < NUM_INST_CNTS);
-    return ScoreLBs[T];
-  }
-
-  unsigned getScoreUB(InstCounterType T) const {
-    assert(T < NUM_INST_CNTS);
-    return ScoreUBs[T];
-  }
-
-  unsigned getScoreRange(InstCounterType T) const {
-    return getScoreUB(T) - getScoreLB(T);
-  }
-
-  unsigned getRegScore(int GprNo, InstCounterType T) const {
-    if (GprNo < NUM_ALL_VGPRS) {
-      return VgprScores[T][GprNo];
-    }
-    assert(T == SmemAccessCounter);
-    return SgprScores[GprNo - NUM_ALL_VGPRS];
-  }
-
-  bool merge(const WaitcntBrackets &Other);
-
-  RegInterval getRegInterval(const MachineInstr *MI,
-                             const MachineRegisterInfo *MRI,
-                             const SIRegisterInfo *TRI, unsigned OpNo) const;
-
-  bool counterOutOfOrder(InstCounterType T) const;
-  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
-  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
-  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
-  void applyWaitcnt(InstCounterType T, unsigned Count);
-  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
-                     const MachineRegisterInfo *MRI, WaitEventType E,
-                     MachineInstr &MI);
-
-  unsigned hasPendingEvent() const { return PendingEvents; }
-  unsigned hasPendingEvent(WaitEventType E) const {
-    return PendingEvents & (1 << E);
-  }
-  unsigned hasPendingEvent(InstCounterType T) const {
-    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
-    assert((HasPending != 0) == (getScoreRange(T) != 0));
-    return HasPending;
-  }
-
-  bool hasMixedPendingEvents(InstCounterType T) const {
-    unsigned Events = hasPendingEvent(T);
-    // Return true if more than one bit is set in Events.
-    return Events & (Events - 1);
-  }
-
-  bool hasPendingFlat() const {
-    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
-             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
-            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
-             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
-  }
-
-  void setPendingFlat() {
-    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
-    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
-  }
-
-  // Return true if there might be pending writes to the specified vgpr by VMEM
-  // instructions with types different from V.
-  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
-    assert(GprNo < NUM_ALL_VGPRS);
-    return VgprVmemTypes[GprNo] & ~(1 << V);
-  }
-
-  void clearVgprVmemTypes(int GprNo) {
-    assert(GprNo < NUM_ALL_VGPRS);
-    VgprVmemTypes[GprNo] = 0;
-  }
-
-  void setStateOnFunctionEntryOrReturn() {
-    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
-    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
-  }
-
-  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
-    return LDSDMAStores;
+  SIWaitCntsInserter() {}
+  SIWaitCntsInserter(const GCNSubtarget *ST, const MachineRegisterInfo *MRI,
+                     WaitCntGenerator *WCG, InstCounterType MC, bool FEZWC,
+                     MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+                     AliasAnalysis *AA)
+      : AMDGPUWaitCntInserter(ST, MRI, WCG, MC), MLI(MLI), PDT(PDT), AA(AA),
+        ForceEmitZeroWaitcnts(FEZWC) {
+    for (auto T : inst_counter_types())
+      ForceEmitWaitcnt[T] = false;
   }
-
-  void print(raw_ostream &);
-  void dump() { print(dbgs()); }
+  bool generateWaitcntInstBefore(MachineInstr &MI,
+                                 WaitcntBrackets &ScoreBrackets,
+                                 MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
+                                 VGPRInstsSet *VGPRInsts) override;
+  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+                            WaitcntBrackets &ScoreBrackets,
+                            VGPRInstsSet *VGPRInsts = nullptr) override;
+  void updateEventWaitcntAfter(MachineInstr &Inst,
+                               WaitcntBrackets *ScoreBrackets) override;
 
 private:
-  struct MergeInfo {
-    unsigned OldLB;
-    unsigned OtherLB;
-    unsigned MyShift;
-    unsigned OtherShift;
-  };
-  static bool mergeScore(const MergeInfo &M, unsigned &Score,
-                         unsigned OtherScore);
-
-  void setScoreLB(InstCounterType T, unsigned Val) {
-    assert(T < NUM_INST_CNTS);
-    ScoreLBs[T] = Val;
-  }
-
-  void setScoreUB(InstCounterType T, unsigned Val) {
-    assert(T < NUM_INST_CNTS);
-    ScoreUBs[T] = Val;
-
-    if (T != EXP_CNT)
-      return;
-
-    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
-      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
-  }
-
-  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
-    if (GprNo < NUM_ALL_VGPRS) {
-      VgprUB = std::max(VgprUB, GprNo);
-      VgprScores[T][GprNo] = Val;
-    } else {
-      assert(T == SmemAccessCounter);
-      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
-      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
-    }
-  }
-
-  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
-                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
-                   unsigned OpNo, unsigned Val);
-
-  const GCNSubtarget *ST = nullptr;
-  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
-  HardwareLimits Limits = {};
-  RegisterEncoding Encoding = {};
-  const unsigned *WaitEventMaskForInst;
-  InstCounterType SmemAccessCounter;
-  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
-  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
-  unsigned PendingEvents = 0;
-  // Remember the last flat memory operation.
-  unsigned LastFlat[NUM_INST_CNTS] = {0};
-  // wait_cnt scores for every vgpr.
-  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
-  int VgprUB = -1;
-  int SgprUB = -1;
-  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
-  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
-  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
-  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
-  // write to each vgpr.
-  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
-  // Store representative LDS DMA operations. The only useful info here is
-  // alias info. One store is kept per unique AAInfo.
-  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
-};
-
-// This abstracts the logic for generating and updating S_WAIT* instructions
-// away from the analysis that determines where they are needed. This was
-// done because the set of counters and instructions for waiting on them
-// underwent a major shift with gfx12, sufficiently so that having this
-// abstraction allows the main analysis logic to be simpler than it would
-// otherwise have had to become.
-class WaitcntGenerator {
-protected:
-  const GCNSubtarget *ST = nullptr;
-  const SIInstrInfo *TII = nullptr;
-  AMDGPU::IsaVersion IV;
-  InstCounterType MaxCounter;
-
-public:
-  WaitcntGenerator() {}
-  WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
-      : ST(ST), TII(ST->getInstrInfo()),
-        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
-
-  // Edits an existing sequence of wait count instructions according
-  // to an incoming Waitcnt value, which is itself updated to reflect
-  // any new wait count instructions which may need to be generated by
-  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
-  // were made.
-  //
-  // This editing will usually merely update operands, but it may also
-  // delete instructions if the incoming Wait value indicates they are not
-  // needed. It may also remove existing instructions for which a wait
-  // is needed if it can be determined that it is better to generate new
-  // instructions later, as can happen on gfx12.
-  virtual bool
-  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
-                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
-                          MachineBasicBlock::instr_iterator It) const = 0;
-
-  // Transform a soft waitcnt into a normal one.
-  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
-
-  // Generates new wait count instructions according to the value of
-  // Wait, returning true if any new instructions were created.
-  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
-                                MachineBasicBlock::instr_iterator It,
-                                AMDGPU::Waitcnt Wait) = 0;
-
-  // Returns an array of bit masks which can be used to map values in
-  // WaitEventType to corresponding counter values in InstCounterType.
-  virtual const unsigned *getWaitEventMask() const = 0;
-
-  // Returns a new waitcnt with all counters except VScnt set to 0. If
-  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
-  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
-
-  virtual ~WaitcntGenerator() = default;
-};
-
-class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
-public:
-  WaitcntGeneratorPreGFX12() {}
-  WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
-      : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
-
-  bool
-  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
-                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
-                          MachineBasicBlock::instr_iterator It) const override;
-
-  bool createNewWaitcnt(MachineBasicBlock &Block,
-                        MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
-
-  const unsigned *getWaitEventMask() const override {
-    assert(ST);
-
-    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
-            (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
-            (1 << SQ_MESSAGE),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
-        0,
-        0,
-        0};
-
-    return WaitEventMaskForInstPreGFX12;
-  }
-
-  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
-};
-
-class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
-public:
-  WaitcntGeneratorGFX12Plus() {}
-  WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
-      : WaitcntGenerator(ST, MaxCounter) {}
-
-  bool
-  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
-                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
-                          MachineBasicBlock::instr_iterator It) const override;
-
-  bool createNewWaitcnt(MachineBasicBlock &Block,
-                        MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
-
-  const unsigned *getWaitEventMask() const override {
-    assert(ST);
-
-    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
-        (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
-        (1 << VMEM_SAMPLER_READ_ACCESS),
-        (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
-
-    return WaitEventMaskForInstGFX12Plus;
-  }
-
-  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
-};
-
-class SIInsertWaitcnts : public MachineFunctionPass {
-private:
-  const GCNSubtarget *ST = nullptr;
-  const SIInstrInfo *TII = nullptr;
-  const SIRegisterInfo *TRI = nullptr;
-  const MachineRegisterInfo *MRI = nullptr;
-
-  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
-  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
   MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
   AliasAnalysis *AA = nullptr;
 
-  struct BlockInfo {
-    std::unique_ptr<WaitcntBrackets> Incoming;
-    bool Dirty = true;
-  };
-
-  InstCounterType SmemAccessCounter;
+  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+  bool isPreheaderToFlush(MachineBasicBlock &MBB,
+                          WaitcntBrackets &ScoreBrackets);
+  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets) const;
+  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const;
+  void setForceEmitWaitcnt();
 
-  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
 
   // ForceEmitZeroWaitcnts: force all waitcnt insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
   bool ForceEmitZeroWaitcnts;
   bool ForceEmitWaitcnt[NUM_INST_CNTS];
+};
 
-  bool OptNone;
-
-  // In any given run of this pass, WCG will point to one of these two
-  // generator objects, which must have been re-initialised before use
-  // from a value made using a subtarget constructor.
-  WaitcntGeneratorPreGFX12 WCGPreGFX12;
-  WaitcntGeneratorGFX12Plus WCGGFX12Plus;
-
-  WaitcntGenerator *WCG = nullptr;
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either LDS or FLAT.
+bool SIWaitCntsInserter::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+  assert(TII->isFLAT(MI));
 
-  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
-  // message.
-  DenseSet<MachineInstr *> ReleaseVGPRInsts;
+  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
+  if (!TII->usesLGKM_CNT(MI))
+    return false;
 
-  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+  // If in tgsplit mode then there can be no use of LDS.
+  if (ST->isTgSplitEnabled())
+    return false;
 
-public:
-  static char ID;
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access LDS.
+  if (MI.memoperands_empty())
+    return true;
 
-  SIInsertWaitcnts() : MachineFunctionPass(ID) {
-    (void)ForceExpCounter;
-    (void)ForceLgkmCounter;
-    (void)ForceVMCounter;
+  // See if any memory operand specifies an address space that involves LDS.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+      return true;
   }
 
-  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
-  bool isPreheaderToFlush(MachineBasicBlock &MBB,
-                          WaitcntBrackets &ScoreBrackets);
-  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override {
-    return "SI insert wait instructions";
-  }
+  return false;
+}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<MachinePostDominatorTree>();
-    AU.addUsedIfAvailable<AAResultsWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIWaitCntsInserter::isPreheaderToFlush(MachineBasicBlock &MBB,
+                                            WaitcntBrackets &ScoreBrackets) {
+  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+  if (!IsInserted)
+    return Iterator->second;
 
-  bool isForceEmitWaitcnt() const {
-    for (auto T : inst_counter_types())
-      if (ForceEmitWaitcnt[T])
-        return true;
+  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
+  if (!Succ)
     return false;
-  }
-
-  void setForceEmitWaitcnt() {
-// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
-// For debug builds, get the debug counter info and adjust if need be
-#ifndef NDEBUG
-    if (DebugCounter::isCounterSet(ForceExpCounter) &&
-        DebugCounter::shouldExecute(ForceExpCounter)) {
-      ForceEmitWaitcnt[EXP_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[EXP_CNT] = false;
-    }
-
-    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
-        DebugCounter::shouldExecute(ForceLgkmCounter)) {
-      ForceEmitWaitcnt[DS_CNT] = true;
-      ForceEmitWaitcnt[KM_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[DS_CNT] = false;
-      ForceEmitWaitcnt[KM_CNT] = false;
-    }
 
-    if (DebugCounter::isCounterSet(ForceVMCounter) &&
-        DebugCounter::shouldExecute(ForceVMCounter)) {
-      ForceEmitWaitcnt[LOAD_CNT] = true;
-      ForceEmitWaitcnt[SAMPLE_CNT] = true;
-      ForceEmitWaitcnt[BVH_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[LOAD_CNT] = false;
-      ForceEmitWaitcnt[SAMPLE_CNT] = false;
-      ForceEmitWaitcnt[BVH_CNT] = false;
-    }
-#endif // NDEBUG
-  }
-
-  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
-  // FLAT instruction.
-  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
-    // Maps VMEM access types to their corresponding WaitEventType.
-    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
-        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
-
-    assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
-    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
-    // these should use VM_CNT.
-    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
-      return VMEM_ACCESS;
-    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
-      // FLAT and SCRATCH instructions may access scratch. Other VMEM
-      // instructions do not.
-      if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
-        return SCRATCH_WRITE_ACCESS;
-      return VMEM_WRITE_ACCESS;
-    }
-    if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
-      return VMEM_READ_ACCESS;
-    return VmemReadMapping[getVmemType(Inst)];
-  }
-
-  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
-  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
-  bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
-  bool generateWaitcntInstBefore(MachineInstr &MI,
-                                 WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr,
-                                 bool FlushVmCnt);
-  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
-                               WaitcntBrackets &ScoreBrackets,
-                               MachineInstr *OldWaitcntInstr);
-  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
-                       MachineBasicBlock::instr_iterator It,
-                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
-                       MachineInstr *OldWaitcntInstr);
-  void updateEventWaitcntAfter(MachineInstr &Inst,
-                               WaitcntBrackets *ScoreBrackets);
-  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
-                            WaitcntBrackets &ScoreBrackets);
-};
+  MachineLoop *Loop = MLI->getLoopFor(Succ);
+  if (!Loop)
+    return false;
 
-} // end anonymous namespace
-
-RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
-                                            const MachineRegisterInfo *MRI,
-                                            const SIRegisterInfo *TRI,
-                                            unsigned OpNo) const {
-  const MachineOperand &Op = MI->getOperand(OpNo);
-  if (!TRI->isInAllocatableClass(Op.getReg()))
-    return {-1, -1};
-
-  // A use via a PW operand does not need a waitcnt.
-  // A partial write is not a WAW.
-  assert(!Op.getSubReg() || !Op.isUndef());
-
-  RegInterval Result;
-
-  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
-                 AMDGPU::HWEncoding::REG_IDX_MASK;
-
-  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
-    assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
-    Result.first = Reg - Encoding.VGPR0;
-    if (TRI->isAGPR(*MRI, Op.getReg()))
-      Result.first += AGPR_OFFSET;
-    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
-  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
-    assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
-    Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
-    assert(Result.first >= NUM_ALL_VGPRS &&
-           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+  if (Loop->getLoopPreheader() == &MBB &&
+      shouldFlushVmCnt(Loop, ScoreBrackets)) {
+    Iterator->second = true;
+    return true;
   }
-  // TODO: Handle TTMP
-  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
-  else
-    return {-1, -1};
-
-  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
-  unsigned Size = TRI->getRegSizeInBits(*RC);
-  Result.second = Result.first + ((Size + 16) / 32);
 
-  return Result;
-}
-
-void WaitcntBrackets::setExpScore(const MachineInstr *MI,
-                                  const SIInstrInfo *TII,
-                                  const SIRegisterInfo *TRI,
-                                  const MachineRegisterInfo *MRI, unsigned OpNo,
-                                  unsigned Val) {
-  RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
-  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
-  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-    setRegScore(RegNo, EXP_CNT, Val);
-  }
+  return false;
 }
 
-void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
-                                    const SIRegisterInfo *TRI,
-                                    const MachineRegisterInfo *MRI,
-                                    WaitEventType E, MachineInstr &Inst) {
-  InstCounterType T = eventCounter(WaitEventMaskForInst, E);
-
-  unsigned UB = getScoreUB(T);
-  unsigned CurrScore = UB + 1;
-  if (CurrScore == 0)
-    report_fatal_error("InsertWaitcnt score wraparound");
-  // PendingEvents and ScoreUB need to be updated regardless of whether this
-  // event changes the score of a register or not.
-  // Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
-  PendingEvents |= 1 << E;
-  setScoreUB(T, CurrScore);
-
-  if (T == EXP_CNT) {
-    // Put score on the source vgprs. If this is a store, just use those
-    // specific register(s).
-    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
-      int AddrOpIdx =
-          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
-      // All GDS operations must protect their address register (same as
-      // export.)
-      if (AddrOpIdx != -1) {
-        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
-      }
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+//    vgpr containing a value that is loaded outside of the loop. (Only on
+//    targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+//    loop, and at least one use of a vgpr containing a value that is loaded
+//    outside of the loop.
+bool SIWaitCntsInserter::shouldFlushVmCnt(MachineLoop *ML,
+                                          WaitcntBrackets &Brackets) const {
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool UsesVgprLoadedOutside = false;
+  DenseSet<Register> VgprUse;
+  DenseSet<Register> VgprDef;
 
-      if (Inst.mayStore()) {
-        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
-          setExpScore(
-              &Inst, TII, TRI, MRI,
-              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
-              CurrScore);
-        }
-        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
-          setExpScore(&Inst, TII, TRI, MRI,
-                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
-                                                 AMDGPU::OpName::data1),
-                      CurrScore);
-        }
-      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
-                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
-                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
-                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
-        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
-          const MachineOperand &Op = Inst.getOperand(I);
-          if (Op.isReg() && !Op.isDef() &&
-              TRI->isVectorRegister(*MRI, Op.getReg())) {
-            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
-          }
-        }
-      }
-    } else if (TII->isFLAT(Inst)) {
-      if (Inst.mayStore()) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
-            CurrScore);
-      } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
-            CurrScore);
-      }
-    } else if (TII->isMIMG(Inst)) {
-      if (Inst.mayStore()) {
-        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
-      } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
-            CurrScore);
-      }
-    } else if (TII->isMTBUF(Inst)) {
-      if (Inst.mayStore()) {
-        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
-      }
-    } else if (TII->isMUBUF(Inst)) {
-      if (Inst.mayStore()) {
-        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
-      } else if (SIInstrInfo::isAtomicRet(Inst)) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
-            CurrScore);
-      }
-    } else if (TII->isLDSDIR(Inst)) {
-      // LDSDIR instructions attach the score to the destination.
-      setExpScore(
-          &Inst, TII, TRI, MRI,
-          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
-          CurrScore);
-    } else {
-      if (TII->isEXP(Inst)) {
-        // For export the destination registers are really temps that
-        // can be used as the actual source after export patching, so
-        // we need to treat them like sources and set the EXP_CNT
-        // score.
-        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
-          MachineOperand &DefMO = Inst.getOperand(I);
-          if (DefMO.isReg() && DefMO.isDef() &&
-              TRI->isVGPR(*MRI, DefMO.getReg())) {
-            setRegScore(
-                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
-                EXP_CNT, CurrScore);
-          }
-        }
-      }
-      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
-        MachineOperand &MO = Inst.getOperand(I);
-        if (MO.isReg() && !MO.isDef() &&
-            TRI->isVectorRegister(*MRI, MO.getReg())) {
-          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
-        }
-      }
-    }
-#if 0 // TODO: check if this is handled by MUBUF code above.
-  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
-       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
-       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
-    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
-    unsigned OpNo;//TODO: find the OpNo for this operand;
-    RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
-    for (int RegNo = Interval.first; RegNo < Interval.second;
-    ++RegNo) {
-      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
-    }
-#endif
-  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
-    // Match the score to the destination registers.
-    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
-      auto &Op = Inst.getOperand(I);
-      if (!Op.isReg() || !Op.isDef())
-        continue;
-      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
-      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
-        if (Interval.first >= NUM_ALL_VGPRS)
-          continue;
-        if (updateVMCntOnly(Inst)) {
-          // updateVMCntOnly should only leave us with VGPRs
-          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
-          // defs. That's required for a sane index into `VgprVmemTypes` below
-          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
-          VmemType V = getVmemType(Inst);
-          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
-            VgprVmemTypes[RegNo] |= 1 << V;
-        }
-      }
-      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        setRegScore(RegNo, T, CurrScore);
+  for (MachineBasicBlock *MBB : ML->blocks()) {
+    for (MachineInstr &MI : *MBB) {
+      if (isVMEMOrFlatVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
       }
-    }
-    if (Inst.mayStore() &&
-        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
-      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
-      // written can be accessed. A load from LDS to VMEM does not need a wait.
-      unsigned Slot = 0;
-      for (const auto *MemOp : Inst.memoperands()) {
-        if (!MemOp->isStore() ||
-            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+        MachineOperand &Op = MI.getOperand(I);
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
           continue;
-        // Comparing just AA info does not guarantee memoperands are equal
-        // in general, but this is so for LDS DMA in practice.
-        auto AAI = MemOp->getAAInfo();
-        // Alias scope information gives a way to definitively identify an
-        // original memory object, and is in practice produced by the module LDS
-        // lowering pass. If there is no scope available we will not be able
-        // to disambiguate LDS aliasing as after the module lowering all LDS
-        // is squashed into a single big object. Do not attempt to use one of
-        // the limited LDSDMAStores for something we will not be able to use
-        // anyway.
-        if (!AAI || !AAI.Scope)
-          break;
-        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
-          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
-            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
-              Slot = I + 1;
+        auto [RegLow, RegHigh] = Brackets.getRegInterval(&MI, MRI, TRI, I);
+        // Vgpr use
+        if (Op.isUse()) {
+          for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprDef.contains(RegNo))
+              return false;
+            VgprUse.insert(RegNo);
+            // If at least one of Op's registers is in the score brackets, the
+            // value is likely loaded outside of the loop.
+            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
+                    Brackets.getScoreLB(LOAD_CNT) ||
+                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
+                    Brackets.getScoreLB(SAMPLE_CNT) ||
+                Brackets.getRegScore(RegNo, BVH_CNT) >
+                    Brackets.getScoreLB(BVH_CNT)) {
+              UsesVgprLoadedOutside = true;
               break;
             }
           }
-        }
-        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
-          break;
-        LDSDMAStores.push_back(&Inst);
-        Slot = LDSDMAStores.size();
-        break;
-      }
-      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
-      if (Slot)
-        setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
-    }
-  }
-}
-
-void WaitcntBrackets::print(raw_ostream &OS) {
-  OS << '\n';
-  for (auto T : inst_counter_types(MaxCounter)) {
-    unsigned SR = getScoreRange(T);
-
-    switch (T) {
-    case LOAD_CNT:
-      OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
-         << SR << "): ";
-      break;
-    case DS_CNT:
-      OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
-         << SR << "): ";
-      break;
-    case EXP_CNT:
-      OS << "    EXP_CNT(" << SR << "): ";
-      break;
-    case STORE_CNT:
-      OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
-         << SR << "): ";
-      break;
-    case SAMPLE_CNT:
-      OS << "    SAMPLE_CNT(" << SR << "): ";
-      break;
-    case BVH_CNT:
-      OS << "    BVH_CNT(" << SR << "): ";
-      break;
-    case KM_CNT:
-      OS << "    KM_CNT(" << SR << "): ";
-      break;
-    default:
-      OS << "    UNKNOWN(" << SR << "): ";
-      break;
-    }
-
-    if (SR != 0) {
-      // Print vgpr scores.
-      unsigned LB = getScoreLB(T);
-
-      for (int J = 0; J <= VgprUB; J++) {
-        unsigned RegScore = getRegScore(J, T);
-        if (RegScore <= LB)
-          continue;
-        unsigned RelScore = RegScore - LB - 1;
-        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
-          OS << RelScore << ":v" << J << " ";
-        } else {
-          OS << RelScore << ":ds ";
-        }
-      }
-      // Also need to print sgpr scores for lgkm_cnt.
-      if (T == SmemAccessCounter) {
-        for (int J = 0; J <= SgprUB; J++) {
-          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
-          if (RegScore <= LB)
-            continue;
-          unsigned RelScore = RegScore - LB - 1;
-          OS << RelScore << ":s" << J << " ";
-        }
-      }
-    }
-    OS << '\n';
-  }
-  OS << '\n';
-}
-
-/// Simplify the waitcnt, in the sense of removing redundant counts, and return
-/// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
-  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
-  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
-  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
-  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
-  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
-  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
-  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
-}
-
-void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
-                                      unsigned &Count) const {
-  // The number of outstanding events for this type, T, can be calculated
-  // as (UB - LB). If the current Count is greater than or equal to the number
-  // of outstanding events, then the wait for this counter is redundant.
-  if (Count >= getScoreRange(T))
-    Count = ~0u;
-}
-
-void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
-                                    AMDGPU::Waitcnt &Wait) const {
-  unsigned ScoreToWait = getRegScore(RegNo, T);
-
-  // If the score of src_operand falls within the bracket, we need an
-  // s_waitcnt instruction.
-  const unsigned LB = getScoreLB(T);
-  const unsigned UB = getScoreUB(T);
-  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
-        !ST->hasFlatLgkmVMemCountInOrder()) {
-      // If there is a pending FLAT operation, and this is a VMem or LGKM
-      // waitcnt and the target can report early completion, then we need
-      // to force a waitcnt 0.
-      addWait(Wait, T, 0);
-    } else if (counterOutOfOrder(T)) {
-      // Counter can get decremented out-of-order when there
-      // are multiple event types in the bracket. Also emit an s_wait counter
-      // with a conservative value of 0 for the counter.
-      addWait(Wait, T, 0);
-    } else {
-      // If a counter has been maxed out avoid overflow by waiting for
-      // MAX(CounterType) - 1 instead.
-      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
-      addWait(Wait, T, NeededWait);
-    }
-  }
-}
-
-void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
-  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
-  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
-  applyWaitcnt(DS_CNT, Wait.DsCnt);
-  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
-  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
-  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
-  applyWaitcnt(KM_CNT, Wait.KmCnt);
-}
-
-void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
-  const unsigned UB = getScoreUB(T);
-  if (Count >= UB)
-    return;
-  if (Count != 0) {
-    if (counterOutOfOrder(T))
-      return;
-    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
-  } else {
-    setScoreLB(T, UB);
-    PendingEvents &= ~WaitEventMaskForInst[T];
-  }
-}
-
-// Where there are multiple types of event in the bracket of a counter,
-// the decrement may go out of order.
-bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
-  // Scalar memory read always can go out of order.
-  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
-    return true;
-  return hasMixedPendingEvents(T);
-}
-
-INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
-                      false)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
-                    false)
-
-char SIInsertWaitcnts::ID = 0;
-
-char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
-
-FunctionPass *llvm::createSIInsertWaitcntsPass() {
-  return new SIInsertWaitcnts();
-}
-
-static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
-                                     unsigned NewEnc) {
-  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
-  assert(OpIdx >= 0);
-
-  MachineOperand &MO = MI.getOperand(OpIdx);
-
-  if (NewEnc == MO.getImm())
-    return false;
-
-  MO.setImm(NewEnc);
-  return true;
-}
-
-/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
-/// and if so, which counter it is waiting on.
-static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
-  switch (Opcode) {
-  case AMDGPU::S_WAIT_LOADCNT:
-    return LOAD_CNT;
-  case AMDGPU::S_WAIT_EXPCNT:
-    return EXP_CNT;
-  case AMDGPU::S_WAIT_STORECNT:
-    return STORE_CNT;
-  case AMDGPU::S_WAIT_SAMPLECNT:
-    return SAMPLE_CNT;
-  case AMDGPU::S_WAIT_BVHCNT:
-    return BVH_CNT;
-  case AMDGPU::S_WAIT_DSCNT:
-    return DS_CNT;
-  case AMDGPU::S_WAIT_KMCNT:
-    return KM_CNT;
-  default:
-    return {};
-  }
-}
-
-bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
-  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
-  if (Opcode == Waitcnt->getOpcode())
-    return false;
-
-  Waitcnt->setDesc(TII->get(Opcode));
-  return true;
-}
-
-/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
-/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
-/// from \p Wait that were added by previous passes. Currently this pass
-/// conservatively assumes that these preexisting waits are required for
-/// correctness.
-bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
-    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
-    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
-  assert(ST);
-  assert(isNormalMode(MaxCounter));
-
-  bool Modified = false;
-  MachineInstr *WaitcntInstr = nullptr;
-  MachineInstr *WaitcntVsCntInstr = nullptr;
-
-  for (auto &II :
-       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
-      continue;
-
-    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool IsSoft = Opcode != II.getOpcode();
-
-    // Update required wait count. If this is a soft waitcnt (= it was added
-    // by an earlier pass), it may be entirely removed.
-    if (Opcode == AMDGPU::S_WAITCNT) {
-      unsigned IEnc = II.getOperand(0).getImm();
-      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
-      if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
-      Wait = Wait.combined(OldWait);
-
-      // Merge consecutive waitcnt of the same type by erasing multiples.
-      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
-        II.eraseFromParent();
-        Modified = true;
-      } else
-        WaitcntInstr = &II;
-    } else {
-      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
-      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
-      unsigned OldVSCnt =
-          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
-      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
-
-      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
-        II.eraseFromParent();
-        Modified = true;
-      } else
-        WaitcntVsCntInstr = &II;
-    }
-  }
-
-  if (WaitcntInstr) {
-    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
-                                         AMDGPU::encodeWaitcnt(IV, Wait));
-    Modified |= promoteSoftWaitCnt(WaitcntInstr);
-
-    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
-    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
-    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
-    Wait.LoadCnt = ~0u;
-    Wait.ExpCnt = ~0u;
-    Wait.DsCnt = ~0u;
-
-    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
-                   ? dbgs()
-                         << "applyPreexistingWaitcnt\n"
-                         << "New Instr at block end: " << *WaitcntInstr << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
-                            << "Old Instr: " << *It
-                            << "New Instr: " << *WaitcntInstr << '\n');
-  }
-
-  if (WaitcntVsCntInstr) {
-    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
-                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
-    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
-
-    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
-    Wait.StoreCnt = ~0u;
-
-    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
-                   ? dbgs() << "applyPreexistingWaitcnt\n"
-                            << "New Instr at block end: " << *WaitcntVsCntInstr
-                            << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
-                            << "Old Instr: " << *It
-                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
-  }
-
-  return Modified;
-}
-
-/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
-/// required counters in \p Wait
-bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
-    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
-  assert(ST);
-  assert(isNormalMode(MaxCounter));
-
-  bool Modified = false;
-  const DebugLoc &DL = Block.findDebugLoc(It);
-
-  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
-  // single instruction while VScnt has its own instruction.
-  if (Wait.hasWaitExceptStoreCnt()) {
-    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    [[maybe_unused]] auto SWaitInst =
-        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
-    Modified = true;
-
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
-  }
-
-  if (Wait.hasWaitStoreCnt()) {
-    assert(ST->hasVscnt());
-
-    [[maybe_unused]] auto SWaitInst =
-        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
-            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
-            .addImm(Wait.StoreCnt);
-    Modified = true;
-
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
-  }
-
-  return Modified;
-}
-
-AMDGPU::Waitcnt
-WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
-  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
-}
-
-AMDGPU::Waitcnt
-WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
-  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
-}
-
-/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
-/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
-/// were added by previous passes. Currently this pass conservatively
-/// assumes that these preexisting waits are required for correctness.
-bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
-    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
-    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
-  assert(ST);
-  assert(!isNormalMode(MaxCounter));
-
-  bool Modified = false;
-  MachineInstr *CombinedLoadDsCntInstr = nullptr;
-  MachineInstr *CombinedStoreDsCntInstr = nullptr;
-  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
-
-  for (auto &II :
-       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
-      continue;
-
-    MachineInstr **UpdatableInstr;
-
-    // Update required wait count. If this is a soft waitcnt (= it was added
-    // by an earlier pass), it may be entirely removed.
-
-    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool IsSoft = Opcode != II.getOpcode();
-
-    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
-      unsigned OldEnc =
-          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
-      if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
-      Wait = Wait.combined(OldWait);
-      UpdatableInstr = &CombinedLoadDsCntInstr;
-    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
-      unsigned OldEnc =
-          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
-      if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
-      Wait = Wait.combined(OldWait);
-      UpdatableInstr = &CombinedStoreDsCntInstr;
-    } else {
-      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
-      assert(CT.has_value());
-      unsigned OldCnt =
-          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
-      addWait(Wait, CT.value(), OldCnt);
-      UpdatableInstr = &WaitInstrs[CT.value()];
-    }
-
-    // Merge consecutive waitcnt of the same type by erasing multiples.
-    if (!*UpdatableInstr) {
-      *UpdatableInstr = &II;
-    } else {
-      II.eraseFromParent();
-      Modified = true;
-    }
-  }
-
-  if (CombinedLoadDsCntInstr) {
-    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
-    // to be waited for. Otherwise, let the instruction be deleted so
-    // the appropriate single counter wait instruction can be inserted
-    // instead, when new S_WAIT_*CNT instructions are inserted by
-    // createNewWaitcnt(). As a side effect, resetting the wait counts will
-    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
-    // the loop below that deals with single counter instructions.
-    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
-      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
-      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
-                                           AMDGPU::OpName::simm16, NewEnc);
-      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
-      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
-      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
-      Wait.LoadCnt = ~0u;
-      Wait.DsCnt = ~0u;
-
-      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
-                              << "New Instr at block end: "
-                              << *CombinedLoadDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
-                              << "Old Instr: " << *It << "New Instr: "
-                              << *CombinedLoadDsCntInstr << '\n');
-    } else {
-      CombinedLoadDsCntInstr->eraseFromParent();
-      Modified = true;
-    }
-  }
-
-  if (CombinedStoreDsCntInstr) {
-    // Similarly for S_WAIT_STORECNT_DSCNT.
-    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
-      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
-      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
-                                           AMDGPU::OpName::simm16, NewEnc);
-      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
-      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
-      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
-      Wait.StoreCnt = ~0u;
-      Wait.DsCnt = ~0u;
-
-      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
-                              << "New Instr at block end: "
-                              << *CombinedStoreDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
-                              << "Old Instr: " << *It << "New Instr: "
-                              << *CombinedStoreDsCntInstr << '\n');
-    } else {
-      CombinedStoreDsCntInstr->eraseFromParent();
-      Modified = true;
-    }
-  }
-
-  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
-  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
-  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
-  // instructions so that createNewWaitcnt() will create new combined
-  // instructions to replace them.
-
-  if (Wait.DsCnt != ~0u) {
-    // This is a vector of addresses in WaitInstrs pointing to instructions
-    // that should be removed if they are present.
-    SmallVector<MachineInstr **, 2> WaitsToErase;
-
-    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
-    // both) need to be waited for, ensure that there are no existing
-    // individual wait count instructions for these.
-
-    if (Wait.LoadCnt != ~0u) {
-      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
-      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
-    } else if (Wait.StoreCnt != ~0u) {
-      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
-      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
-    }
-
-    for (MachineInstr **WI : WaitsToErase) {
-      if (!*WI)
-        continue;
-
-      (*WI)->eraseFromParent();
-      *WI = nullptr;
-      Modified = true;
-    }
-  }
-
-  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-    if (!WaitInstrs[CT])
-      continue;
-
-    unsigned NewCnt = getWait(Wait, CT);
-    if (NewCnt != ~0u) {
-      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
-                                           AMDGPU::OpName::simm16, NewCnt);
-      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
-
-      ScoreBrackets.applyWaitcnt(CT, NewCnt);
-      setNoWait(Wait, CT);
-
-      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
-                              << "New Instr at block end: " << *WaitInstrs[CT]
-                              << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
-                              << "Old Instr: " << *It
-                              << "New Instr: " << *WaitInstrs[CT] << '\n');
-    } else {
-      WaitInstrs[CT]->eraseFromParent();
-      Modified = true;
+        }
+        // VMem load vgpr def
+        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
+          for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprUse.contains(RegNo))
+              return false;
+            VgprDef.insert(RegNo);
+          }
+      }
     }
   }
-
-  return Modified;
+  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+    return true;
+  return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
-/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
-bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
-    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
-  assert(ST);
-  assert(!isNormalMode(MaxCounter));
-
-  bool Modified = false;
-  const DebugLoc &DL = Block.findDebugLoc(It);
-
-  // Check for opportunities to use combined wait instructions.
-  if (Wait.DsCnt != ~0u) {
-    MachineInstr *SWaitInst = nullptr;
-
-    if (Wait.LoadCnt != ~0u) {
-      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
-
-      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
-                      .addImm(Enc);
-
-      Wait.LoadCnt = ~0u;
-      Wait.DsCnt = ~0u;
-    } else if (Wait.StoreCnt != ~0u) {
-      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
-
-      SWaitInst =
-          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
-              .addImm(Enc);
-
-      Wait.StoreCnt = ~0u;
-      Wait.DsCnt = ~0u;
-    }
-
-    if (SWaitInst) {
-      Modified = true;
-
-      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-                 dbgs() << "New Instr: " << *SWaitInst << '\n');
-    }
+// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
+// FLAT instruction.
+WaitEventType
+SIWaitCntsInserter::getVmemWaitEventType(const MachineInstr &Inst) const {
+  // Maps VMEM access types to their corresponding WaitEventType.
+  static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
+      VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+
+  assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
+  // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+  // these should use VM_CNT.
+  if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+    return VMEM_ACCESS;
+  if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
+    // FLAT and SCRATCH instructions may access scratch. Other VMEM
+    // instructions do not.
+    if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+      return SCRATCH_WRITE_ACCESS;
+    return VMEM_WRITE_ACCESS;
+  }
+  if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+    return VMEM_READ_ACCESS;
+  return VmemReadMapping[getVmemType(Inst)];
+}
+
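
The order of the checks above matters: LDS-DMA traffic and pre-VScnt targets fold everything into vmcnt, stores split into scratch versus plain VMEM writes, and only gfx12+ gives sampler/BVH reads their own counters. A minimal standalone model of that decision ladder, where the InstDesc summary and enum values are illustrative stand-ins rather than the LLVM definitions:

  // Standalone model of the decision ladder in getVmemWaitEventType().
  #include <cassert>

  enum WaitEventType {
    VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
    VMEM_BVH_READ_ACCESS, SCRATCH_WRITE_ACCESS, VMEM_WRITE_ACCESS
  };

  struct InstDesc {    // hypothetical one-instruction summary
    bool HasVscnt;     // subtarget has a separate store counter
    bool LDSDMA;       // LDS DMA load (a store on the LDS side)
    bool Store, AtomicRet;
    bool Flat, MayHitScratch;
    bool ExtendedCnts; // gfx12+ split counters
    int VmemType;      // 0 = plain, 1 = sampler, 2 = BVH
  };

  WaitEventType classify(const InstDesc &I) {
    static const WaitEventType ReadMap[3] = {
        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
    if (!I.HasVscnt || I.LDSDMA)
      return VMEM_ACCESS;            // everything folds into vmcnt
    if (I.Store && !I.AtomicRet)     // returning atomics count as reads
      return (I.Flat && I.MayHitScratch) ? SCRATCH_WRITE_ACCESS
                                         : VMEM_WRITE_ACCESS;
    if (!I.ExtendedCnts || I.Flat)
      return VMEM_READ_ACCESS;
    return ReadMap[I.VmemType];      // gfx12+: sampler/BVH reads split off
  }

  int main() {
    InstDesc BufStore{true, false, true, false, false, false, true, 0};
    assert(classify(BufStore) == VMEM_WRITE_ACCESS); // plain store, gfx12
  }
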
+void SIWaitCntsInserter::setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
+// For debug builds, get the debug counter info and adjust if need be
+#ifndef NDEBUG
+  if (DebugCounter::isCounterSet(ForceExpCounter) &&
+      DebugCounter::shouldExecute(ForceExpCounter)) {
+    ForceEmitWaitcnt[EXP_CNT] = true;
+  } else {
+    ForceEmitWaitcnt[EXP_CNT] = false;
   }
 
-  // Generate an instruction for any remaining counter that needs
-  // waiting for.
-
-  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-    unsigned Count = getWait(Wait, CT);
-    if (Count == ~0u)
-      continue;
-
-    [[maybe_unused]] auto SWaitInst =
-        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
-            .addImm(Count);
-
-    Modified = true;
-
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+      DebugCounter::shouldExecute(ForceLgkmCounter)) {
+    ForceEmitWaitcnt[DS_CNT] = true;
+    ForceEmitWaitcnt[KM_CNT] = true;
+  } else {
+    ForceEmitWaitcnt[DS_CNT] = false;
+    ForceEmitWaitcnt[KM_CNT] = false;
   }
 
-  return Modified;
-}
-
-static bool readsVCCZ(const MachineInstr &MI) {
-  unsigned Opc = MI.getOpcode();
-  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
-         !MI.getOperand(1).isUndef();
-}
-
-/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
-static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
-  // Currently all conventions wait, but this may not always be the case.
-  //
-  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
-  // senses to omit the wait and do it in the caller.
-  return true;
-}
-
-/// \returns true if the callee is expected to wait for any outstanding waits
-/// before returning.
-static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
-  return true;
+  if (DebugCounter::isCounterSet(ForceVMCounter) &&
+      DebugCounter::shouldExecute(ForceVMCounter)) {
+    ForceEmitWaitcnt[LOAD_CNT] = true;
+    ForceEmitWaitcnt[SAMPLE_CNT] = true;
+    ForceEmitWaitcnt[BVH_CNT] = true;
+  } else {
+    ForceEmitWaitcnt[LOAD_CNT] = false;
+    ForceEmitWaitcnt[SAMPLE_CNT] = false;
+    ForceEmitWaitcnt[BVH_CNT] = false;
+  }
+#endif // NDEBUG
 }
 
 ///  Generate s_waitcnt instruction to be placed before cur_Inst.
@@ -1601,10 +292,9 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
 ///  scores (*_score_LB and *_score_ub respectively).
 ///  If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
 ///  flush the vmcnt counter here.
-bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
-                                                 WaitcntBrackets &ScoreBrackets,
-                                                 MachineInstr *OldWaitcntInstr,
-                                                 bool FlushVmCnt) {
+bool SIWaitCntsInserter::generateWaitcntInstBefore(
+    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+    MachineInstr *OldWaitcntInstr, bool FlushVmCnt, VGPRInstsSet *VGPRInsts) {
   setForceEmitWaitcnt();
 
   if (MI.isMetaInstruction())
@@ -1624,6 +314,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     Wait.LoadCnt = 0;
   }
 
+  MachineFunction *MF = MI.getParent()->getParent();
+  bool OptNone = MF->getFunction().hasOptNone() ||
+                 MF->getTarget().getOptLevel() == CodeGenOptLevel::None;
+  InstCounterType SmemAccessCounter =
+      eventCounter(WCG->getWaitEventMask(), SMEM_ACCESS);
+
   // All waits must be resolved at call return.
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
@@ -1644,7 +340,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
         ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
-      ReleaseVGPRInsts.insert(&MI);
+      VGPRInsts->insert(&MI);
   }
   // Resolve vm waits before gs-done.
   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -1734,21 +430,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
 
       if (MI.getOperand(CallAddrOpIdx).isReg()) {
-        RegInterval CallAddrOpInterval =
+        auto [CallAddrOpLow, CallAddrOpHigh] =
             ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
 
-        for (int RegNo = CallAddrOpInterval.first;
-             RegNo < CallAddrOpInterval.second; ++RegNo)
+        for (int RegNo = CallAddrOpLow; RegNo < CallAddrOpHigh; ++RegNo)
           ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
 
         int RtnAddrOpIdx =
           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
         if (RtnAddrOpIdx != -1) {
-          RegInterval RtnAddrOpInterval =
+          auto [RtnAddrOpLow, RtnAddrOpHigh] =
               ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
 
-          for (int RegNo = RtnAddrOpInterval.first;
-               RegNo < RtnAddrOpInterval.second; ++RegNo)
+          for (int RegNo = RtnAddrOpLow; RegNo < RtnAddrOpHigh; ++RegNo)
             ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
         }
       }
@@ -1816,10 +510,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
           continue;
 
-        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
+        auto [RegLow, RegHigh] = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
 
         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
-        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
           if (IsVGPR) {
             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
             // previous write and this write are the same type of VMEM
@@ -1867,361 +561,39 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   ScoreBrackets.simplifyWaitcnt(Wait);
 
   if (ForceEmitZeroWaitcnts)
-    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
-
-  if (ForceEmitWaitcnt[LOAD_CNT])
-    Wait.LoadCnt = 0;
-  if (ForceEmitWaitcnt[EXP_CNT])
-    Wait.ExpCnt = 0;
-  if (ForceEmitWaitcnt[DS_CNT])
-    Wait.DsCnt = 0;
-  if (ForceEmitWaitcnt[SAMPLE_CNT])
-    Wait.SampleCnt = 0;
-  if (ForceEmitWaitcnt[BVH_CNT])
-    Wait.BvhCnt = 0;
-  if (ForceEmitWaitcnt[KM_CNT])
-    Wait.KmCnt = 0;
-
-  if (FlushVmCnt) {
-    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
-      Wait.LoadCnt = 0;
-    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
-      Wait.SampleCnt = 0;
-    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
-      Wait.BvhCnt = 0;
-  }
-
-  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
-                         OldWaitcntInstr);
-}
-
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
-                                               WaitcntBrackets &ScoreBrackets,
-                                               MachineInstr *OldWaitcntInstr) {
-  AMDGPU::Waitcnt Wait;
-
-  unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
-  unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
-  unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
-  if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
-    return false;
-
-  if (LoadCntPending != 0)
-    Wait.LoadCnt = 0;
-  if (SampleCntPending != 0)
-    Wait.SampleCnt = 0;
-  if (BvhCntPending != 0)
-    Wait.BvhCnt = 0;
-
-  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
-                         OldWaitcntInstr);
-}
-
-bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
-                                       MachineBasicBlock::instr_iterator It,
-                                       MachineBasicBlock &Block,
-                                       WaitcntBrackets &ScoreBrackets,
-                                       MachineInstr *OldWaitcntInstr) {
-  bool Modified = false;
-
-  if (OldWaitcntInstr)
-    // Try to merge the required wait with preexisting waitcnt instructions.
-    // Also erase redundant waitcnt.
-    Modified =
-        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
-
-  // Any counts that could have been applied to any existing waitcnt
-  // instructions will have been done so, now deal with any remaining.
-  ScoreBrackets.applyWaitcnt(Wait);
-
-  // ExpCnt can be merged into VINTERP.
-  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
-      SIInstrInfo::isVINTERP(*It)) {
-    MachineOperand *WaitExp =
-        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
-    if (Wait.ExpCnt < WaitExp->getImm()) {
-      WaitExp->setImm(Wait.ExpCnt);
-      Modified = true;
-    }
-    Wait.ExpCnt = ~0u;
-
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
-                      << "Update Instr: " << *It);
-  }
-
-  if (WCG->createNewWaitcnt(Block, It, Wait))
-    Modified = true;
-
-  return Modified;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // All flat instructions use the VMEM counter.
-  assert(TII->usesVM_CNT(MI));
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access VMEM.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves VMEM.
-  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
-  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
-  // (GDS) address space is not supported by flat operations. Therefore, simply
-  // return true unless only the LDS address space is found.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    assert(AS != AMDGPUAS::REGION_ADDRESS);
-    if (AS != AMDGPUAS::LOCAL_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
-  if (!TII->usesLGKM_CNT(MI))
-    return false;
-
-  // If in tgsplit mode then there can be no use of LDS.
-  if (ST->isTgSplitEnabled())
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access LDS.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves LDS.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
-    const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // SCRATCH instructions always access scratch.
-  if (TII->isFLATScratch(MI))
-    return true;
-
-  // GLOBAL instructions never access scratch.
-  if (TII->isFLATGlobal(MI))
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access scratch.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves scratch.
-  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
-    unsigned AS = Memop->getAddrSpace();
-    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
-  });
-}
-
-static bool isCacheInvOrWBInst(MachineInstr &Inst) {
-  auto Opc = Inst.getOpcode();
-  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
-         Opc == AMDGPU::GLOBAL_WBINV;
-}
-
-void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
-                                               WaitcntBrackets *ScoreBrackets) {
-  // Now look at the instruction opcode. If it is a memory access
-  // instruction, update the upper-bound of the appropriate counter's
-  // bracket and the destination operand scores.
-  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
-
-  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
-    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
-        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
-    } else {
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
-    }
-  } else if (TII->isFLAT(Inst)) {
-    // TODO: Track this properly.
-    if (isCacheInvOrWBInst(Inst))
-      return;
-
-    assert(Inst.mayLoadOrStore());
-
-    int FlatASCount = 0;
-
-    if (mayAccessVMEMThroughFlat(Inst)) {
-      ++FlatASCount;
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
-                                   Inst);
-    }
-
-    if (mayAccessLDSThroughFlat(Inst)) {
-      ++FlatASCount;
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
-    }
-
-    // A Flat memory operation must access at least one address space.
-    assert(FlatASCount);
-
-    // This is a flat memory operation that access both VMEM and LDS, so note it
-    // - it will require that both the VM and LGKM be flushed to zero if it is
-    // pending when a VM or LGKM dependency occurs.
-    if (FlatASCount > 1)
-      ScoreBrackets->setPendingFlat();
-  } else if (SIInstrInfo::isVMEM(Inst) &&
-             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
-                                 Inst);
-
-    if (ST->vmemWriteNeedsExpWaitcnt() &&
-        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
-    }
-  } else if (TII->isSMRD(Inst)) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
-  } else if (Inst.isCall()) {
-    if (callWaitsOnFunctionReturn(Inst)) {
-      // Act as a wait on everything
-      ScoreBrackets->applyWaitcnt(
-          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
-      ScoreBrackets->setStateOnFunctionEntryOrReturn();
-    } else {
-      // May need to way wait for anything.
-      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
-    }
-  } else if (SIInstrInfo::isLDSDIR(Inst)) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
-  } else if (TII->isVINTERP(Inst)) {
-    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
-    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
-  } else if (SIInstrInfo::isEXP(Inst)) {
-    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
-    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
-    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
-    else
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
-  } else {
-    switch (Inst.getOpcode()) {
-    case AMDGPU::S_SENDMSG:
-    case AMDGPU::S_SENDMSG_RTN_B32:
-    case AMDGPU::S_SENDMSG_RTN_B64:
-    case AMDGPU::S_SENDMSGHALT:
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
-      break;
-    case AMDGPU::S_MEMTIME:
-    case AMDGPU::S_MEMREALTIME:
-    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
-    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
-    case AMDGPU::S_BARRIER_LEAVE:
-    case AMDGPU::S_GET_BARRIER_STATE_M0:
-    case AMDGPU::S_GET_BARRIER_STATE_IMM:
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
-      break;
-    }
-  }
-}
-
-bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
-                                 unsigned OtherScore) {
-  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
-  unsigned OtherShifted =
-      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
-  Score = std::max(MyShifted, OtherShifted);
-  return OtherShifted > MyShifted;
-}
+    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
 
-/// Merge the pending events and associater score brackets of \p Other into
-/// this brackets status.
-///
-/// Returns whether the merge resulted in a change that requires tighter waits
-/// (i.e. the merged brackets strictly dominate the original brackets).
-bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
-  bool StrictDom = false;
-
-  VgprUB = std::max(VgprUB, Other.VgprUB);
-  SgprUB = std::max(SgprUB, Other.SgprUB);
-
-  for (auto T : inst_counter_types(MaxCounter)) {
-    // Merge event flags for this counter
-    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
-    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
-    if (OtherEvents & ~OldEvents)
-      StrictDom = true;
-    PendingEvents |= OtherEvents;
-
-    // Merge scores for this counter
-    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
-    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
-    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
-    if (NewUB < ScoreLBs[T])
-      report_fatal_error("waitcnt score overflow");
-
-    MergeInfo M;
-    M.OldLB = ScoreLBs[T];
-    M.OtherLB = Other.ScoreLBs[T];
-    M.MyShift = NewUB - ScoreUBs[T];
-    M.OtherShift = NewUB - Other.ScoreUBs[T];
-
-    ScoreUBs[T] = NewUB;
-
-    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
-
-    for (int J = 0; J <= VgprUB; J++)
-      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
-
-    if (T == SmemAccessCounter) {
-      for (int J = 0; J <= SgprUB; J++)
-        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
-    }
-  }
+  if (ForceEmitWaitcnt[LOAD_CNT])
+    Wait.LoadCnt = 0;
+  if (ForceEmitWaitcnt[EXP_CNT])
+    Wait.ExpCnt = 0;
+  if (ForceEmitWaitcnt[DS_CNT])
+    Wait.DsCnt = 0;
+  if (ForceEmitWaitcnt[SAMPLE_CNT])
+    Wait.SampleCnt = 0;
+  if (ForceEmitWaitcnt[BVH_CNT])
+    Wait.BvhCnt = 0;
+  if (ForceEmitWaitcnt[KM_CNT])
+    Wait.KmCnt = 0;
 
-  for (int J = 0; J <= VgprUB; J++) {
-    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
-    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
-    VgprVmemTypes[J] = NewVmemTypes;
+  if (FlushVmCnt) {
+    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+      Wait.LoadCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+      Wait.SampleCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+      Wait.BvhCnt = 0;
   }
 
-  return StrictDom;
-}
-
-static bool isWaitInstr(MachineInstr &Inst) {
-  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
-  return Opcode == AMDGPU::S_WAITCNT ||
-         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
-          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
-         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
-         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
-         counterTypeForInstr(Opcode).has_value();
+  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
+                         OldWaitcntInstr);
 }
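
Throughout this function the requirement accumulates into an AMDGPU::Waitcnt in which ~0u means "no wait" and smaller counts are stricter. A toy standalone model of that lattice, mirroring the min-combining behaviour the code expects from Waitcnt::combined() (not the LLVM struct itself):

  #include <algorithm>
  #include <cassert>

  struct Waitcnt {
    unsigned LoadCnt = ~0u, ExpCnt = ~0u, DsCnt = ~0u, StoreCnt = ~0u;
  };

  // Combining two requirements takes the minimum, i.e. the stricter wait.
  Waitcnt combined(Waitcnt A, const Waitcnt &B) {
    A.LoadCnt = std::min(A.LoadCnt, B.LoadCnt);
    A.ExpCnt = std::min(A.ExpCnt, B.ExpCnt);
    A.DsCnt = std::min(A.DsCnt, B.DsCnt);
    A.StoreCnt = std::min(A.StoreCnt, B.StoreCnt);
    return A;
  }

  int main() {
    Waitcnt Required;      // nothing needed yet
    Required.LoadCnt = 2;  // tolerate two loads still in flight
    Waitcnt Preexisting;   // wait left behind by an earlier pass
    Preexisting.LoadCnt = 0;
    Waitcnt W = combined(Required, Preexisting);
    assert(W.LoadCnt == 0 && W.DsCnt == ~0u); // stricter wait wins
  }
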
 
 // Generate s_waitcnt instructions where needed.
-bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
-                                            MachineBasicBlock &Block,
-                                            WaitcntBrackets &ScoreBrackets) {
+bool SIWaitCntsInserter::insertWaitcntInBlock(MachineFunction &MF,
+                                              MachineBasicBlock &Block,
+                                              WaitcntBrackets &ScoreBrackets,
+                                              VGPRInstsSet *VGPRInsts) {
   bool Modified = false;
 
   LLVM_DEBUG({
@@ -2265,7 +637,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
-                                          FlushVmCnt);
+                                          FlushVmCnt, VGPRInsts);
     OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
@@ -2355,324 +727,165 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   return Modified;
 }
 
-// Return true if the given machine basic block is a preheader of a loop in
-// which we want to flush the vmcnt counter, and false otherwise.
-bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
-                                          WaitcntBrackets &ScoreBrackets) {
-  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
-  if (!IsInserted)
-    return Iterator->second;
-
-  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
-  if (!Succ)
-    return false;
-
-  MachineLoop *Loop = MLI->getLoopFor(Succ);
-  if (!Loop)
-    return false;
+void SIWaitCntsInserter::updateEventWaitcntAfter(
+    MachineInstr &Inst, WaitcntBrackets *ScoreBrackets) {
+  // Now look at the instruction opcode. If it is a memory access
+  // instruction, update the upper-bound of the appropriate counter's
+  // bracket and the destination operand scores.
+  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
-  if (Loop->getLoopPreheader() == &MBB &&
-      shouldFlushVmCnt(Loop, ScoreBrackets)) {
-    Iterator->second = true;
-    return true;
-  }
+  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
+    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+    } else {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+    }
+  } else if (TII->isFLAT(Inst)) {
+    // TODO: Track this properly.
+    if (isCacheInvOrWBInst(Inst))
+      return;
 
-  return false;
-}
+    assert(Inst.mayLoadOrStore());
 
-bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
-  return SIInstrInfo::isVMEM(MI) ||
-         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
-}
+    int FlatASCount = 0;
 
-// Return true if it is better to flush the vmcnt counter in the preheader of
-// the given loop. We currently decide to flush in two situations:
-// 1. The loop contains vmem store(s), no vmem load and at least one use of a
-//    vgpr containing a value that is loaded outside of the loop. (Only on
-//    targets with no vscnt counter).
-// 2. The loop contains vmem load(s), but the loaded values are not used in the
-//    loop, and at least one use of a vgpr containing a value that is loaded
-//    outside of the loop.
-bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
-                                        WaitcntBrackets &Brackets) {
-  bool HasVMemLoad = false;
-  bool HasVMemStore = false;
-  bool UsesVgprLoadedOutside = false;
-  DenseSet<Register> VgprUse;
-  DenseSet<Register> VgprDef;
+    if (mayAccessVMEMThroughFlat(Inst)) {
+      ++FlatASCount;
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
+                                   Inst);
+    }
 
-  for (MachineBasicBlock *MBB : ML->blocks()) {
-    for (MachineInstr &MI : *MBB) {
-      if (isVMEMOrFlatVMEM(MI)) {
-        if (MI.mayLoad())
-          HasVMemLoad = true;
-        if (MI.mayStore())
-          HasVMemStore = true;
-      }
-      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
-        MachineOperand &Op = MI.getOperand(I);
-        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
-          continue;
-        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
-        // Vgpr use
-        if (Op.isUse()) {
-          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-            // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
-            if (VgprDef.contains(RegNo))
-              return false;
-            VgprUse.insert(RegNo);
-            // If at least one of Op's registers is in the score brackets, the
-            // value is likely loaded outside of the loop.
-            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
-                    Brackets.getScoreLB(LOAD_CNT) ||
-                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
-                    Brackets.getScoreLB(SAMPLE_CNT) ||
-                Brackets.getRegScore(RegNo, BVH_CNT) >
-                    Brackets.getScoreLB(BVH_CNT)) {
-              UsesVgprLoadedOutside = true;
-              break;
-            }
-          }
-        }
-        // VMem load vgpr def
-        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
-          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-            // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
-            if (VgprUse.contains(RegNo))
-              return false;
-            VgprDef.insert(RegNo);
-          }
-      }
+    if (mayAccessLDSThroughFlat(Inst)) {
+      ++FlatASCount;
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
-  }
-  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
-    return true;
-  return HasVMemLoad && UsesVgprLoadedOutside;
-}
 
-bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
-  ST = &MF.getSubtarget<GCNSubtarget>();
-  TII = ST->getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  MLI = &getAnalysis<MachineLoopInfo>();
-  PDT = &getAnalysis<MachinePostDominatorTree>();
-  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
-    AA = &AAR->getAAResults();
+    // A Flat memory operation must access at least one address space.
+    assert(FlatASCount);
 
-  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+    // This is a flat memory operation that accesses both VMEM and LDS, so
+    // note it - it will require that both the VM and LGKM be flushed to zero
+    // if it is pending when a VM or LGKM dependency occurs.
+    if (FlatASCount > 1)
+      ScoreBrackets->setPendingFlat();
+  } else if (SIInstrInfo::isVMEM(Inst) &&
+             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
+                                 Inst);
 
-  if (ST->hasExtendedWaitCounts()) {
-    MaxCounter = NUM_EXTENDED_INST_CNTS;
-    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
-    WCG = &WCGGFX12Plus;
+    if (ST->vmemWriteNeedsExpWaitcnt() &&
+        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+    }
+  } else if (TII->isSMRD(Inst)) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything
+      ScoreBrackets->applyWaitcnt(
+          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
+      ScoreBrackets->setStateOnFunctionEntryOrReturn();
+    } else {
+      // May need to way wait for anything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
+  } else if (SIInstrInfo::isLDSDIR(Inst)) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
+  } else if (TII->isVINTERP(Inst)) {
+    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
+    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
+  } else if (SIInstrInfo::isEXP(Inst)) {
+    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+    else
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
   } else {
-    MaxCounter = NUM_NORMAL_INST_CNTS;
-    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
-    WCG = &WCGPreGFX12;
+    switch (Inst.getOpcode()) {
+    case AMDGPU::S_SENDMSG:
+    case AMDGPU::S_SENDMSG_RTN_B32:
+    case AMDGPU::S_SENDMSG_RTN_B64:
+    case AMDGPU::S_SENDMSGHALT:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+      break;
+    case AMDGPU::S_MEMTIME:
+    case AMDGPU::S_MEMREALTIME:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+    case AMDGPU::S_BARRIER_LEAVE:
+    case AMDGPU::S_GET_BARRIER_STATE_M0:
+    case AMDGPU::S_GET_BARRIER_STATE_IMM:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+      break;
+    }
   }
+}
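
Each event recorded above advances the owning counter's upper bound and stamps the touched registers with the new score; a later reader then only has to wait until the operations issued after its producer have drained. A self-contained model of that scoreboard bookkeeping (names and sizes illustrative):

  #include <cassert>

  enum Counter { LOAD_CNT, DS_CNT, NUM_CNTS };

  struct Brackets {
    unsigned UB[NUM_CNTS] = {};
    unsigned RegScore[NUM_CNTS][4] = {}; // four registers for the example

    void updateByEvent(Counter C, unsigned Reg) {
      ++UB[C];                 // one more operation in flight
      RegScore[C][Reg] = UB[C];
    }
    // Count that must be waited on before Reg's value is safe to read.
    unsigned neededWait(Counter C, unsigned Reg) const {
      return UB[C] - RegScore[C][Reg]; // ops issued after Reg's producer
    }
  };

  int main() {
    Brackets B;
    B.updateByEvent(LOAD_CNT, /*Reg=*/0);   // load writing v0
    B.updateByEvent(LOAD_CNT, /*Reg=*/1);   // later load writing v1
    assert(B.neededWait(LOAD_CNT, 0) == 1); // s_waitcnt vmcnt(1) suffices
    assert(B.neededWait(LOAD_CNT, 1) == 0); // must drain to vmcnt(0)
  }
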
 
-  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
-  for (auto T : inst_counter_types())
-    ForceEmitWaitcnt[T] = false;
-
-  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
-
-  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
-
-  OptNone = MF.getFunction().hasOptNone() ||
-            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
+class SIInsertWaitcnts : public MachineFunctionPass {
+public:
+  static char ID;
 
-  HardwareLimits Limits = {};
-  if (ST->hasExtendedWaitCounts()) {
-    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
-    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
-  } else {
-    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
-    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
+  SIInsertWaitcnts() : MachineFunctionPass(ID) {
+    (void)ForceExpCounter;
+    (void)ForceLgkmCounter;
+    (void)ForceVMCounter;
   }
-  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
-  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
-  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
-  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
-  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
-
-  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
-  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
-  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
-  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
-
-  RegisterEncoding Encoding = {};
-  Encoding.VGPR0 =
-      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
-  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
-  Encoding.SGPR0 =
-      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
-  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
-
-  BlockInfos.clear();
-  bool Modified = false;
-
-  MachineBasicBlock &EntryBB = MF.front();
-  MachineBasicBlock::iterator I = EntryBB.begin();
-
-  if (!MFI->isEntryFunction()) {
-    // Wait for any outstanding memory operations that the input registers may
-    // depend on. We can't track them and it's better to do the wait after the
-    // costly call sequence.
-
-    // TODO: Could insert earlier and schedule more liberally with operations
-    // that only use caller preserved registers.
-    for (MachineBasicBlock::iterator E = EntryBB.end();
-         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
-      ;
-
-    if (ST->hasExtendedWaitCounts()) {
-      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
-          .addImm(0);
-      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
-          continue;
 
-        BuildMI(EntryBB, I, DebugLoc(),
-                TII->get(instrsForExtendedCounterTypes[CT]))
-            .addImm(0);
-      }
-    } else {
-      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
-    }
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
-        ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
-        SmemAccessCounter);
-    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
-    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+  StringRef getPassName() const override {
+    return "SI Insert Wait Instructions";
+  }
 
-    Modified = true;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addUsedIfAvailable<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
 
-  // Keep iterating over the blocks in reverse post order, inserting and
-  // updating s_waitcnt where needed, until a fix point is reached.
-  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
-    BlockInfos.insert({MBB, BlockInfo()});
-
-  std::unique_ptr<WaitcntBrackets> Brackets;
-  bool Repeat;
-  do {
-    Repeat = false;
-
-    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
-         ++BII) {
-      MachineBasicBlock *MBB = BII->first;
-      BlockInfo &BI = BII->second;
-      if (!BI.Dirty)
-        continue;
-
-      if (BI.Incoming) {
-        if (!Brackets)
-          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
-        else
-          *Brackets = *BI.Incoming;
-      } else {
-        if (!Brackets)
-          Brackets = std::make_unique<WaitcntBrackets>(
-              ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
-              SmemAccessCounter);
-        else
-          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
-                                      WaitEventMaskForInst, SmemAccessCounter);
-      }
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE,
+                      "SI Insert Wait Instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Wait Instructions",
+                    false, false)
 
-      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
-      BI.Dirty = false;
-
-      if (Brackets->hasPendingEvent()) {
-        BlockInfo *MoveBracketsToSucc = nullptr;
-        for (MachineBasicBlock *Succ : MBB->successors()) {
-          auto SuccBII = BlockInfos.find(Succ);
-          BlockInfo &SuccBI = SuccBII->second;
-          if (!SuccBI.Incoming) {
-            SuccBI.Dirty = true;
-            if (SuccBII <= BII)
-              Repeat = true;
-            if (!MoveBracketsToSucc) {
-              MoveBracketsToSucc = &SuccBI;
-            } else {
-              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
-            }
-          } else if (SuccBI.Incoming->merge(*Brackets)) {
-            SuccBI.Dirty = true;
-            if (SuccBII <= BII)
-              Repeat = true;
-          }
-        }
-        if (MoveBracketsToSucc)
-          MoveBracketsToSucc->Incoming = std::move(Brackets);
-      }
-    }
-  } while (Repeat);
+char SIInsertWaitcnts::ID = 0;
 
-  if (ST->hasScalarStores()) {
-    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-    bool HaveScalarStores = false;
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
 
-    for (MachineBasicBlock &MBB : MF) {
-      for (MachineInstr &MI : MBB) {
-        if (!HaveScalarStores && TII->isScalarStore(MI))
-          HaveScalarStores = true;
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+  return new SIInsertWaitcnts();
+}
 
-        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
-            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
-          EndPgmBlocks.push_back(&MBB);
-      }
-    }
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
+  MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+  AliasAnalysis *AA = nullptr;
+  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+    AA = &AAR->getAAResults();
 
-    if (HaveScalarStores) {
-      // If scalar writes are used, the cache must be flushed or else the next
-      // wave to reuse the same scratch memory can be clobbered.
-      //
-      // Insert s_dcache_wb at wave termination points if there were any scalar
-      // stores, and only if the cache hasn't already been flushed. This could
-      // be improved by looking across blocks for flushes in postdominating
-      // blocks from the stores but an explicitly requested flush is probably
-      // very rare.
-      for (MachineBasicBlock *MBB : EndPgmBlocks) {
-        bool SeenDCacheWB = false;
-
-        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-             I != E; ++I) {
-          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
-            SeenDCacheWB = true;
-          else if (TII->isScalarStore(*I))
-            SeenDCacheWB = false;
-
-          // FIXME: It would be better to insert this before a waitcnt if any.
-          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
-               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
-              !SeenDCacheWB) {
-            Modified = true;
-            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
-          }
-        }
-      }
-    }
-  }
+  WaitCntGeneratorPreGFX12 WCGPreGFX12;
+  WaitCntGeneratorGFX12Plus WCGGFX12Plus;
+  InstCounterType MaxCounter;
+  WaitCntGenerator *WCG =
+      getWaitCntGenerator(MF, WCGPreGFX12, WCGGFX12Plus, MaxCounter);
 
-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
-  for (MachineInstr *MI : ReleaseVGPRInsts) {
-    if (ST->requiresNopBeforeDeallocVGPRs()) {
-      BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
-          .addImm(0);
-    }
-    BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
-        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
-    Modified = true;
-  }
-  ReleaseVGPRInsts.clear();
+  SIWaitCntsInserter WCountsInserter(ST, &MF.getRegInfo(), WCG, MaxCounter,
+                                     ForceEmitZeroFlag, MLI, PDT, AA);
 
-  return Modified;
+  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
+  // message.
+  DenseSet<MachineInstr *> ReleaseVGPRInsts;
+
+  return WCountsInserter.insertWaitCntsInFunction(MF, &ReleaseVGPRInsts);
 }
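
With the heavy lifting moved into the utilities, the legacy pass body above reduces to wiring analyses into SIWaitCntsInserter. A hypothetical sketch, not part of this patch, of how a second waitcnt-insertion pass could drive the same machinery:

  // Hypothetical reuse of the factored-out helpers; name and signature are
  // illustrative only.
  bool runLateWaitCntInsertion(MachineFunction &MF, MachineLoopInfo *MLI,
                               MachinePostDominatorTree *PDT,
                               AliasAnalysis *AA) {
    const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
    WaitCntGeneratorPreGFX12 WCGPreGFX12;
    WaitCntGeneratorGFX12Plus WCGGFX12Plus;
    InstCounterType MaxCounter;
    WaitCntGenerator *WCG =
        getWaitCntGenerator(MF, WCGPreGFX12, WCGGFX12Plus, MaxCounter);

    SIWaitCntsInserter Inserter(ST, &MF.getRegInfo(), WCG, MaxCounter,
                                /*ForceEmitZeroWaitcnts=*/false, MLI, PDT, AA);
    DenseSet<MachineInstr *> ReleaseVGPRInsts;
    return Inserter.insertWaitCntsInFunction(MF, &ReleaseVGPRInsts);
  }
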
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6826cd27319507..a347cdd2241d66 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -14,6 +14,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Alignment.h"
+#include "llvm/TargetParser/TargetParser.h"
 #include <array>
 #include <functional>
 #include <utility>
@@ -40,8 +41,6 @@ struct kernel_descriptor_t;
 
 namespace AMDGPU {
 
-struct IsaVersion;
-
 /// Generic target versions emitted by this version of LLVM.
 ///
 /// These numbers are incremented every time a codegen breaking change occurs
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
new file mode 100644
index 00000000000000..e332a648e59418
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
@@ -0,0 +1,1393 @@
+//===-- AMDGPUWaitCountUtils.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Common interface to insert various wait counts for memory operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUWaitCountUtils.h"
+#include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+#define DEBUG_TYPE "amdgpu-waitcount-utils"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+namespace llvm {
+
+namespace AMDGPU {
+
+static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
+                                     unsigned NewEnc) {
+  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+  assert(OpIdx >= 0);
+
+  MachineOperand &MO = MI.getOperand(OpIdx);
+
+  if (NewEnc == MO.getImm())
+    return false;
+
+  MO.setImm(NewEnc);
+  return true;
+}
+
+/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
+/// and if so, which counter it is waiting on.
+static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_WAIT_LOADCNT:
+    return LOAD_CNT;
+  case AMDGPU::S_WAIT_EXPCNT:
+    return EXP_CNT;
+  case AMDGPU::S_WAIT_STORECNT:
+    return STORE_CNT;
+  case AMDGPU::S_WAIT_SAMPLECNT:
+    return SAMPLE_CNT;
+  case AMDGPU::S_WAIT_BVHCNT:
+    return BVH_CNT;
+  case AMDGPU::S_WAIT_DSCNT:
+    return DS_CNT;
+  case AMDGPU::S_WAIT_KMCNT:
+    return KM_CNT;
+  default:
+    return {};
+  }
+}
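
Callers use the returned counter to fold a single-counter wait into the running requirement, as in applyPreexistingWaitcnt. A sketch of that caller pattern, where II, Wait and TII are assumed from the surrounding context:

  if (std::optional<InstCounterType> CT = counterTypeForInstr(Opcode)) {
    unsigned OldCnt =
        TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
    addWait(Wait, *CT, OldCnt); // keep the stricter of the two requirements
  }
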
+
+bool updateVMCntOnly(const MachineInstr &Inst) {
+  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
+         SIInstrInfo::isFLATScratch(Inst);
+}
+
+bool isWaitInstr(MachineInstr &Inst) {
+  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
+  return Opcode == AMDGPU::S_WAITCNT ||
+         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
+          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
+         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
+         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+         counterTypeForInstr(Opcode).has_value();
+}
+
+VmemType getVmemType(const MachineInstr &Inst) {
+  assert(updateVMCntOnly(Inst));
+  if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+      !SIInstrInfo::isVSAMPLE(Inst))
+    return VMEM_NOSAMPLER;
+  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+  return BaseInfo->BVH       ? VMEM_BVH
+         : BaseInfo->Sampler ? VMEM_SAMPLER
+                             : VMEM_NOSAMPLER;
+}
+
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may
+  // make sense to omit the wait and do it in the caller.
+  return true;
+}
+
+/// \returns true if the callee is expected to wait for any outstanding waits
+/// before returning.
+bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
+
+// Mapping from event to counter according to the table masks.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+  for (auto T : inst_counter_types()) {
+    if (masks[T] & (1 << E))
+      return T;
+  }
+  llvm_unreachable("event type has no associated counter");
+}
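
This is a reverse lookup over the per-counter event masks: each counter's mask has one bit set for every event it tracks, and the first counter whose mask contains E owns the event. A standalone illustration with made-up mask values:

  #include <cassert>

  enum { LOAD_CNT, DS_CNT, NUM_CNTS };
  enum { VMEM_ACCESS, LDS_ACCESS, SMEM_ACCESS };

  int eventCounter(const unsigned Masks[NUM_CNTS], int E) {
    for (int T = 0; T != NUM_CNTS; ++T)
      if (Masks[T] & (1u << E))
        return T;
    return -1; // no counter tracks this event
  }

  int main() {
    const unsigned Masks[NUM_CNTS] = {
        1u << VMEM_ACCESS,                        // LOAD_CNT
        (1u << LDS_ACCESS) | (1u << SMEM_ACCESS), // DS_CNT (lgkmcnt)
    };
    assert(eventCounter(Masks, SMEM_ACCESS) == DS_CNT);
  }
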
+
+bool readsVCCZ(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+         !MI.getOperand(1).isUndef();
+}
+
+bool isCacheInvOrWBInst(MachineInstr &Inst) {
+  auto Opc = Inst.getOpcode();
+  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+         Opc == AMDGPU::GLOBAL_WBINV;
+}
+
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+  return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+  switch (T) {
+  case LOAD_CNT:
+    return Wait.LoadCnt;
+  case EXP_CNT:
+    return Wait.ExpCnt;
+  case DS_CNT:
+    return Wait.DsCnt;
+  case STORE_CNT:
+    return Wait.StoreCnt;
+  case SAMPLE_CNT:
+    return Wait.SampleCnt;
+  case BVH_CNT:
+    return Wait.BvhCnt;
+  case KM_CNT:
+    return Wait.KmCnt;
+  default:
+    llvm_unreachable("bad InstCounterType");
+  }
+}
+
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+  unsigned &WC = getCounterRef(Wait, T);
+  WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+  getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+  return getCounterRef(Wait, T);
+}
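
Taken together, these accessors implement a simple lattice over AMDGPU::Waitcnt: ~0u is the "no wait" sentinel used throughout this file, and smaller counts are stricter. A short usage sketch, assuming a default-constructed Waitcnt starts with every counter at ~0u:

  AMDGPU::Waitcnt Wait;
  addWait(Wait, LOAD_CNT, 3);           // allow three loads in flight
  addWait(Wait, LOAD_CNT, 1);           // a stricter requirement appears
  assert(getWait(Wait, LOAD_CNT) == 1); // std::min kept the stricter wait
  setNoWait(Wait, LOAD_CNT);            // satisfied; back to the sentinel
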
+
+WaitCntGenerator *getWaitCntGenerator(MachineFunction &MF,
+                                      WaitCntGeneratorPreGFX12 &WCGPreGFX12,
+                                      WaitCntGeneratorGFX12Plus &WCGGFX12Plus,
+                                      InstCounterType &MaxCounter) {
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  WaitCntGenerator *WCG = nullptr;
+
+  if (ST->hasExtendedWaitCounts()) {
+    MaxCounter = NUM_EXTENDED_INST_CNTS;
+    WCGGFX12Plus = WaitCntGeneratorGFX12Plus(ST, MaxCounter);
+    WCG = &WCGGFX12Plus;
+  } else {
+    MaxCounter = NUM_NORMAL_INST_CNTS;
+    WCGPreGFX12 = WaitCntGeneratorPreGFX12(ST);
+    WCG = &WCGPreGFX12;
+  }
+
+  return WCG;
+}
+
+//===----------------------------------------------------------------------===//
+// WaitcntBrackets member functions.
+//===----------------------------------------------------------------------===//
+
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                            const MachineRegisterInfo *MRI,
+                                            const SIRegisterInfo *TRI,
+                                            unsigned OpNo) const {
+  const MachineOperand &Op = MI->getOperand(OpNo);
+  if (!TRI->isInAllocatableClass(Op.getReg()))
+    return {-1, -1};
+
+  // A use via a partial-write (PW) operand does not need a waitcnt.
+  // A partial write is not a WAW hazard.
+  assert(!Op.getSubReg() || !Op.isUndef());
+
+  RegInterval Result;
+
+  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
+                 AMDGPU::HWEncoding::REG_IDX_MASK;
+
+  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
+    assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
+    Result.first = Reg - Encoding.VGPR0;
+    if (TRI->isAGPR(*MRI, Op.getReg()))
+      Result.first += AGPR_OFFSET;
+    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+    assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+    Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
+    assert(Result.first >= NUM_ALL_VGPRS &&
+           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+  }
+  // TODO: Handle TTMP
+  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
+  else
+    return {-1, -1};
+
+  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
+  unsigned Size = TRI->getRegSizeInBits(*RC);
+  Result.second = Result.first + ((Size + 16) / 32);
+
+  return Result;
+}
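+
+// Worked example (hypothetical operand, for illustration): for a 64-bit VGPR
+// pair whose encoding gives Reg - Encoding.VGPR0 == 4, Size == 64 yields the
+// half-open interval {4, 4 + (64 + 16) / 32} == {4, 6}, covering both 32-bit
+// halves of the pair.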
+
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                  const SIInstrInfo *TII,
+                                  const SIRegisterInfo *TRI,
+                                  const MachineRegisterInfo *MRI, unsigned OpNo,
+                                  unsigned Val) {
+  auto [RegLow, RegHigh] = getRegInterval(MI, MRI, TRI, OpNo);
+  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
+  for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+    setRegScore(RegNo, EXP_CNT, Val);
+  }
+}
+
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                    const SIRegisterInfo *TRI,
+                                    const MachineRegisterInfo *MRI,
+                                    WaitEventType E, MachineInstr &Inst) {
+  InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+
+  unsigned UB = getScoreUB(T);
+  unsigned CurrScore = UB + 1;
+  if (CurrScore == 0)
+    report_fatal_error("InsertWaitcnt score wraparound");
+  // PendingEvents and ScoreUB need to be updated regardless of whether this
+  // event changes the score of a register, e.g. vm_cnt for a buffer-store or
+  // lgkm_cnt for a send-message.
+  PendingEvents |= 1 << E;
+  setScoreUB(T, CurrScore);
+
+  if (T == EXP_CNT) {
+    // Put score on the source vgprs. If this is a store, just use those
+    // specific register(s).
+    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+      int AddrOpIdx =
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
+      // All GDS operations must protect their address register (same as
+      // export.)
+      if (AddrOpIdx != -1) {
+        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
+      }
+
+      if (Inst.mayStore()) {
+        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
+          setExpScore(&Inst, TII, TRI, MRI,
+                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                                 AMDGPU::OpName::data0),
+                      CurrScore);
+        }
+        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
+          setExpScore(&Inst, TII, TRI, MRI,
+                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                                 AMDGPU::OpName::data1),
+                      CurrScore);
+        }
+      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
+                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
+                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+          const MachineOperand &Op = Inst.getOperand(I);
+          if (Op.isReg() && !Op.isDef() &&
+              TRI->isVectorRegister(*MRI, Op.getReg())) {
+            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+          }
+        }
+      }
+    } else if (TII->isFLAT(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMIMG(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMTBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      }
+    } else if (TII->isMUBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (SIInstrInfo::isAtomicRet(Inst)) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isLDSDIR(Inst)) {
+      // LDSDIR instructions attach the score to the destination.
+      setExpScore(
+          &Inst, TII, TRI, MRI,
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
+          CurrScore);
+    } else {
+      if (TII->isEXP(Inst)) {
+        // For export the destination registers are really temps that
+        // can be used as the actual source after export patching, so
+        // we need to treat them like sources and set the EXP_CNT
+        // score.
+        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+          MachineOperand &DefMO = Inst.getOperand(I);
+          if (DefMO.isReg() && DefMO.isDef() &&
+              TRI->isVGPR(*MRI, DefMO.getReg())) {
+            setRegScore(
+                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
+                EXP_CNT, CurrScore);
+          }
+        }
+      }
+      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = Inst.getOperand(I);
+        if (MO.isReg() && !MO.isDef() &&
+            TRI->isVectorRegister(*MRI, MO.getReg())) {
+          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+        }
+      }
+    }
+#if 0 // TODO: check if this is handled by MUBUF code above.
+  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
+       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
+    unsigned OpNo;//TODO: find the OpNo for this operand;
+    auto [RegLow, RegHigh] = getRegInterval(&Inst, MRI, TRI, OpNo);
+    for (int RegNo = RegLow; RegNo < RegHigh;
+    ++RegNo) {
+      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
+    }
+#endif
+  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
+    // Match the score to the destination registers.
+    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+      auto &Op = Inst.getOperand(I);
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+      auto [RegLow, RegHigh] = getRegInterval(&Inst, MRI, TRI, I);
+      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
+        if (RegLow >= NUM_ALL_VGPRS)
+          continue;
+        if (updateVMCntOnly(Inst)) {
+          // updateVMCntOnly should only leave us with VGPRs: MUBUF, MTBUF,
+          // MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR defs.
+          // That's required for a sane index into `VgprVmemTypes` below.
+          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
+          VmemType V = getVmemType(Inst);
+          for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo)
+            VgprVmemTypes[RegNo] |= 1 << V;
+        }
+      }
+      for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
+    if (Inst.mayStore() &&
+        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
+      // written can be accessed. A load from LDS to VMEM does not need a wait.
+      unsigned Slot = 0;
+      for (const auto *MemOp : Inst.memoperands()) {
+        if (!MemOp->isStore() ||
+            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+          continue;
+        // Comparing just AA info does not guarantee memoperands are equal
+        // in general, but this is so for LDS DMA in practice.
+        auto AAI = MemOp->getAAInfo();
+        // Alias scope information gives a way to definitively identify an
+        // original memory object; in practice it is produced by the module
+        // LDS lowering pass. If no scope is available we will not be able
+        // to disambiguate LDS aliasing, as after the module lowering all LDS
+        // is squashed into a single big object. Do not waste one of the
+        // limited LDSDMAStores slots on something we will not be able to use
+        // anyway.
+        if (!AAI || !AAI.Scope)
+          break;
+        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
+          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
+            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
+              Slot = I + 1;
+              break;
+            }
+          }
+        }
+        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+          break;
+        LDSDMAStores.push_back(&Inst);
+        Slot = LDSDMAStores.size();
+        break;
+      }
+      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
+      if (Slot)
+        setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+    }
+  }
+}
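+
+// For example (illustrative, not exhaustive): a VMEM load writing v[0:1]
+// raises the LOAD_CNT upper bound from N to N + 1 and records score N + 1
+// for both destination VGPRs, so a later use of v0 or v1 can compute exactly
+// how many outstanding loads it must wait for.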
+
+void WaitcntBrackets::print(raw_ostream &OS) {
+  OS << '\n';
+  for (auto T : inst_counter_types(MaxCounter)) {
+    unsigned SR = getScoreRange(T);
+
+    switch (T) {
+    case LOAD_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
+         << SR << "): ";
+      break;
+    case DS_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
+         << SR << "): ";
+      break;
+    case EXP_CNT:
+      OS << "    EXP_CNT(" << SR << "): ";
+      break;
+    case STORE_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
+         << SR << "): ";
+      break;
+    case SAMPLE_CNT:
+      OS << "    SAMPLE_CNT(" << SR << "): ";
+      break;
+    case BVH_CNT:
+      OS << "    BVH_CNT(" << SR << "): ";
+      break;
+    case KM_CNT:
+      OS << "    KM_CNT(" << SR << "): ";
+      break;
+    default:
+      OS << "    UNKNOWN(" << SR << "): ";
+      break;
+    }
+
+    if (SR != 0) {
+      // Print vgpr scores.
+      unsigned LB = getScoreLB(T);
+
+      for (int J = 0; J <= VgprUB; J++) {
+        unsigned RegScore = getRegScore(J, T);
+        if (RegScore <= LB)
+          continue;
+        unsigned RelScore = RegScore - LB - 1;
+        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
+          OS << RelScore << ":v" << J << " ";
+        } else {
+          OS << RelScore << ":ds ";
+        }
+      }
+      // Also need to print sgpr scores for the SMEM access counter.
+      if (T == SmemAccessCounter) {
+        for (int J = 0; J <= SgprUB; J++) {
+          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
+          if (RegScore <= LB)
+            continue;
+          unsigned RelScore = RegScore - LB - 1;
+          OS << RelScore << ":s" << J << " ";
+        }
+      }
+    }
+    OS << '\n';
+  }
+  OS << '\n';
+}
+
+/// Simplify the waitcnt by removing redundant counts: any count that covers
+/// at least as many events as are outstanding is reset to ~0u, i.e. no wait
+/// on that counter is needed.
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
+  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
+  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
+  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+}
+
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+                                      unsigned &Count) const {
+  // The number of outstanding events for this type, T, can be calculated
+  // as (UB - LB). If the current Count is greater than or equal to the number
+  // of outstanding events, then the wait for this counter is redundant.
+  if (Count >= getScoreRange(T))
+    Count = ~0u;
+}
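+
+// Numeric example (assumed values, for illustration): if getScoreRange(T)
+// is 3, at most three events of type T are outstanding, so a wait such as
+// vmcnt(5) can never trigger; Count is reset to ~0u and the wait is dropped.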
+
+void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
+                                    AMDGPU::Waitcnt &Wait) const {
+  unsigned ScoreToWait = getRegScore(RegNo, T);
+
+  // If the score of src_operand falls within the bracket, we need an
+  // s_waitcnt instruction.
+  const unsigned LB = getScoreLB(T);
+  const unsigned UB = getScoreUB(T);
+  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
+        !ST->hasFlatLgkmVMemCountInOrder()) {
+      // If there is a pending FLAT operation, and this is a VMem or LGKM
+      // waitcnt and the target can report early completion, then we need
+      // to force a waitcnt 0.
+      addWait(Wait, T, 0);
+    } else if (counterOutOfOrder(T)) {
+      // The counter can get decremented out-of-order when there are
+      // multiple event types in the bracket, so emit an s_wait with a
+      // conservative value of 0 for this counter.
+      addWait(Wait, T, 0);
+    } else {
+      // If a counter has been maxed out, avoid overflow by waiting for
+      // MAX(CounterType) - 1 instead.
+      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+      addWait(Wait, T, NeededWait);
+    }
+  }
+}
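+
+// Numeric example (assumed values, for illustration): with LB == 5, UB == 10
+// and a register score of 8, the needed wait is min(10 - 8, max - 1) == 2;
+// once no more than two events of this type remain outstanding, the event
+// that produced score 8 must have completed.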
+
+void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
+  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  applyWaitcnt(DS_CNT, Wait.DsCnt);
+  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
+  applyWaitcnt(KM_CNT, Wait.KmCnt);
+}
+
+void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+  const unsigned UB = getScoreUB(T);
+  if (Count >= UB)
+    return;
+  if (Count != 0) {
+    if (counterOutOfOrder(T))
+      return;
+    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
+  } else {
+    setScoreLB(T, UB);
+    PendingEvents &= ~WaitEventMaskForInst[T];
+  }
+}
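+
+// Numeric example (assumed values, for illustration): applying a count of 2
+// when UB == 10 raises the lower bound to 8, recording that at most the two
+// most recent events may still be outstanding; applying a count of 0 clears
+// all pending events for the counter.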
+
+// Where there are multiple event types in the bracket of a counter, the
+// decrement may go out of order.
+bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+  // Scalar memory reads can always go out of order.
+  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+    return true;
+  return hasMixedPendingEvents(T);
+}
+
+WaitCntBitMaskFn WaitcntBrackets::getWaitCntBitMaskFn(InstCounterType T) {
+  switch (T) {
+  case LOAD_CNT:
+    if (ST->hasExtendedWaitCounts())
+      return getLoadcntBitMask;
+
+    return getVmcntBitMask;
+  case DS_CNT:
+    if (ST->hasExtendedWaitCounts())
+      return getDscntBitMask;
+
+    return getLgkmcntBitMask;
+  case EXP_CNT:
+    return getExpcntBitMask;
+  case STORE_CNT:
+    return getStorecntBitMask;
+  case SAMPLE_CNT:
+    return getSamplecntBitMask;
+  case BVH_CNT:
+    return getBvhcntBitMask;
+  case KM_CNT:
+    return getKmcntBitMask;
+  default:
+    llvm_unreachable("bad InstCounterType in getWaitCntBitMaskFn");
+  }
+}
+
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
+                                 unsigned OtherScore) {
+  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+  unsigned OtherShifted =
+      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
+  Score = std::max(MyShifted, OtherShifted);
+  return OtherShifted > MyShifted;
+}
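+
+// Worked example (assumed values, for illustration): if this bracket has
+// LB == 2, UB == 4 and the other has LB == 0, UB == 1, the merged UB stays 4
+// with MyShift == 0 and OtherShift == 3. A live score of 1 from the other
+// bracket is rebased to 4, any score at or below its old LB collapses to 0
+// (completed), and the merged score is the max of the two rebased values.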
+
+/// Merge the pending events and associated score brackets of \p Other into
+/// this bracket's status.
+///
+/// Returns whether the merge resulted in a change that requires tighter waits
+/// (i.e. the merged brackets strictly dominate the original brackets).
+bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
+  bool StrictDom = false;
+
+  VgprUB = std::max(VgprUB, Other.VgprUB);
+  SgprUB = std::max(SgprUB, Other.SgprUB);
+
+  for (auto T : inst_counter_types(MaxCounter)) {
+    // Merge event flags for this counter
+    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
+    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+    if (OtherEvents & ~OldEvents)
+      StrictDom = true;
+    PendingEvents |= OtherEvents;
+
+    // Merge scores for this counter
+    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
+    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
+    if (NewUB < ScoreLBs[T])
+      report_fatal_error("waitcnt score overflow");
+
+    MergeInfo M;
+    M.OldLB = ScoreLBs[T];
+    M.OtherLB = Other.ScoreLBs[T];
+    M.MyShift = NewUB - ScoreUBs[T];
+    M.OtherShift = NewUB - Other.ScoreUBs[T];
+
+    ScoreUBs[T] = NewUB;
+
+    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
+
+    for (int J = 0; J <= VgprUB; J++)
+      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
+
+    if (isSmemCounter(T)) {
+      for (int J = 0; J <= SgprUB; J++)
+        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+    }
+  }
+
+  for (int J = 0; J <= VgprUB; J++) {
+    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
+    VgprVmemTypes[J] = NewVmemTypes;
+  }
+
+  return StrictDom;
+}
+
+//===----------------------------------------------------------------------===//
+// WaitCntGeneratorPreGFX12 member functions.
+//===----------------------------------------------------------------------===//
+
+AMDGPU::Waitcnt
+WaitCntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
+/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
+/// from \p Wait that were added by previous passes. Currently this pass
+/// conservatively assumes that these preexisting waits are required for
+/// correctness.
+bool WaitCntGeneratorPreGFX12::applyPreexistingWaitcnt(
+    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+  assert(ST);
+  assert(isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  MachineInstr *WaitcntInstr = nullptr;
+  MachineInstr *WaitcntVsCntInstr = nullptr;
+
+  for (auto &II :
+       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+    if (II.isMetaInstruction())
+      continue;
+
+    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+    bool IsSoft = Opcode != II.getOpcode();
+
+    // Update required wait count. If this is a soft waitcnt (= it was added
+    // by an earlier pass), it may be entirely removed.
+    if (Opcode == AMDGPU::S_WAITCNT) {
+      unsigned IEnc = II.getOperand(0).getImm();
+      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait);
+      Wait = Wait.combined(OldWait);
+
+      // Merge consecutive waitcnt of the same type by erasing multiples.
+      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
+        II.eraseFromParent();
+        Modified = true;
+      } else
+        WaitcntInstr = &II;
+    } else {
+      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
+      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+      unsigned OldVSCnt =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
+
+      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
+        II.eraseFromParent();
+        Modified = true;
+      } else
+        WaitcntVsCntInstr = &II;
+    }
+  }
+
+  if (WaitcntInstr) {
+    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
+                                         AMDGPU::encodeWaitcnt(IV, Wait));
+    Modified |= promoteSoftWaitCnt(WaitcntInstr);
+
+    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+    Wait.LoadCnt = ~0u;
+    Wait.ExpCnt = ~0u;
+    Wait.DsCnt = ~0u;
+
+    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
+                   ? dbgs()
+                         << "applyPreexistingWaitcnt\n"
+                         << "New Instr at block end: " << *WaitcntInstr << '\n'
+                   : dbgs() << "applyPreexistingWaitcnt\n"
+                            << "Old Instr: " << *It
+                            << "New Instr: " << *WaitcntInstr << '\n');
+  }
+
+  if (WaitcntVsCntInstr) {
+    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
+                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
+    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
+
+    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+    Wait.StoreCnt = ~0u;
+
+    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
+                   ? dbgs() << "applyPreexistingWaitcnt\n"
+                            << "New Instr at block end: " << *WaitcntVsCntInstr
+                            << '\n'
+                   : dbgs() << "applyPreexistingWaitcnt\n"
+                            << "Old Instr: " << *It
+                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
+  }
+
+  return Modified;
+}
+
+/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
+/// required counters in \p Wait
+bool WaitCntGeneratorPreGFX12::createNewWaitcnt(
+    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+    AMDGPU::Waitcnt Wait) {
+  assert(ST);
+  assert(isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
+  // single instruction while VScnt has its own instruction.
+  if (Wait.hasWaitExceptStoreCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    [[maybe_unused]] auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  if (Wait.hasWaitStoreCnt()) {
+    assert(ST->hasVscnt());
+
+    [[maybe_unused]] auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+            .addImm(Wait.StoreCnt);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
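+
+// For example (illustrative assembly, assuming a gfx10 target): a Wait with
+// LoadCnt == 0, DsCnt == 0 and StoreCnt == 0 would produce two instructions:
+//
+//   s_waitcnt vmcnt(0) lgkmcnt(0)
+//   s_waitcnt_vscnt null, 0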
+
+//===----------------------------------------------------------------------===//
+// WaitCntGeneratorGFX12Plus member functions.
+//===----------------------------------------------------------------------===//
+
+AMDGPU::Waitcnt
+WaitCntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
+/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
+/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
+/// were added by previous passes. Currently this pass conservatively
+/// assumes that these preexisting waits are required for correctness.
+bool WaitCntGeneratorGFX12Plus::applyPreexistingWaitcnt(
+    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+  assert(ST);
+  assert(!isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  MachineInstr *CombinedLoadDsCntInstr = nullptr;
+  MachineInstr *CombinedStoreDsCntInstr = nullptr;
+  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
+
+  for (auto &II :
+       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+    if (II.isMetaInstruction())
+      continue;
+
+    MachineInstr **UpdatableInstr;
+
+    // Update required wait count. If this is a soft waitcnt (= it was added
+    // by an earlier pass), it may be entirely removed.
+
+    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+    bool IsSoft = Opcode != II.getOpcode();
+
+    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
+      unsigned OldEnc =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait);
+      Wait = Wait.combined(OldWait);
+      UpdatableInstr = &CombinedLoadDsCntInstr;
+    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
+      unsigned OldEnc =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait);
+      Wait = Wait.combined(OldWait);
+      UpdatableInstr = &CombinedStoreDsCntInstr;
+    } else {
+      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
+      assert(CT.has_value());
+      unsigned OldCnt =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
+      addWait(Wait, CT.value(), OldCnt);
+      UpdatableInstr = &WaitInstrs[CT.value()];
+    }
+
+    // Merge consecutive waitcnt of the same type by erasing multiples.
+    if (!*UpdatableInstr) {
+      *UpdatableInstr = &II;
+    } else {
+      II.eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  if (CombinedLoadDsCntInstr) {
+    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
+    // to be waited for. Otherwise, let the instruction be deleted so
+    // the appropriate single counter wait instruction can be inserted
+    // instead, when new S_WAIT_*CNT instructions are inserted by
+    // createNewWaitcnt(). As a side effect, resetting the wait counts will
+    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
+    // the loop below that deals with single counter instructions.
+    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
+      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
+                                           AMDGPU::OpName::simm16, NewEnc);
+      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
+      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+      Wait.LoadCnt = ~0u;
+      Wait.DsCnt = ~0u;
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: "
+                              << *CombinedLoadDsCntInstr << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It << "New Instr: "
+                              << *CombinedLoadDsCntInstr << '\n');
+    } else {
+      CombinedLoadDsCntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  if (CombinedStoreDsCntInstr) {
+    // Similarly for S_WAIT_STORECNT_DSCNT.
+    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
+      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
+                                           AMDGPU::OpName::simm16, NewEnc);
+      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
+      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+      Wait.StoreCnt = ~0u;
+      Wait.DsCnt = ~0u;
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: "
+                              << *CombinedStoreDsCntInstr << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It << "New Instr: "
+                              << *CombinedStoreDsCntInstr << '\n');
+    } else {
+      CombinedStoreDsCntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
+  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
+  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
+  // instructions so that createNewWaitcnt() will create new combined
+  // instructions to replace them.
+
+  if (Wait.DsCnt != ~0u) {
+    // This is a vector of addresses in WaitInstrs pointing to instructions
+    // that should be removed if they are present.
+    SmallVector<MachineInstr **, 2> WaitsToErase;
+
+    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
+    // both) need to be waited for, ensure that there are no existing
+    // individual wait count instructions for these.
+
+    if (Wait.LoadCnt != ~0u) {
+      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
+      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+    } else if (Wait.StoreCnt != ~0u) {
+      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
+      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+    }
+
+    for (MachineInstr **WI : WaitsToErase) {
+      if (!*WI)
+        continue;
+
+      (*WI)->eraseFromParent();
+      *WI = nullptr;
+      Modified = true;
+    }
+  }
+
+  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    if (!WaitInstrs[CT])
+      continue;
+
+    unsigned NewCnt = getWait(Wait, CT);
+    if (NewCnt != ~0u) {
+      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
+                                           AMDGPU::OpName::simm16, NewCnt);
+      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
+
+      ScoreBrackets.applyWaitcnt(CT, NewCnt);
+      setNoWait(Wait, CT);
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: " << *WaitInstrs[CT]
+                              << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It
+                              << "New Instr: " << *WaitInstrs[CT] << '\n');
+    } else {
+      WaitInstrs[CT]->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  return Modified;
+}
+
+/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
+bool WaitCntGeneratorGFX12Plus::createNewWaitcnt(
+    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+    AMDGPU::Waitcnt Wait) {
+  assert(ST);
+  assert(!isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Check for opportunities to use combined wait instructions.
+  if (Wait.DsCnt != ~0u) {
+    MachineInstr *SWaitInst = nullptr;
+
+    if (Wait.LoadCnt != ~0u) {
+      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+
+      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+                      .addImm(Enc);
+
+      Wait.LoadCnt = ~0u;
+      Wait.DsCnt = ~0u;
+    } else if (Wait.StoreCnt != ~0u) {
+      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+
+      SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
+              .addImm(Enc);
+
+      Wait.StoreCnt = ~0u;
+      Wait.DsCnt = ~0u;
+    }
+
+    if (SWaitInst) {
+      Modified = true;
+
+      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
+  }
+
+  // Generate an instruction for any remaining counter that needs
+  // waiting for.
+
+  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    unsigned Count = getWait(Wait, CT);
+    if (Count == ~0u)
+      continue;
+
+    [[maybe_unused]] auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(Count);
+
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
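+
+// For example (illustrative assembly, assuming a gfx12 target): a Wait with
+// LoadCnt == 0 and DsCnt == 0 folds into one combined instruction,
+//
+//   s_wait_loadcnt_dscnt 0x0
+//
+// while a remaining KmCnt == 0 would get its own s_wait_kmcnt 0x0.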
+
+//===----------------------------------------------------------------------===//
+// AMDGPUWaitCntInserter member functions.
+//===----------------------------------------------------------------------===//
+
+// This is a flat memory operation. Check to see if it has memory tokens other
+// than LDS. Other address spaces supported by flat memory operations involve
+// global memory.
+bool AMDGPUWaitCntInserter::mayAccessVMEMThroughFlat(
+    const MachineInstr &MI) const {
+  assert(TII->isFLAT(MI));
+
+  // All flat instructions use the VMEM counter.
+  assert(TII->usesVM_CNT(MI));
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access VMEM.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves VMEM.
+  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
+  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
+  // (GDS) address space is not supported by flat operations. Therefore, simply
+  // return true unless only the LDS address space is found.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    assert(AS != AMDGPUAS::REGION_ADDRESS);
+    if (AS != AMDGPUAS::LOCAL_ADDRESS)
+      return true;
+  }
+
+  return false;
+}
+
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either scratch or FLAT.
+bool AMDGPUWaitCntInserter::mayAccessScratchThroughFlat(
+    const MachineInstr &MI) const {
+  assert(TII->isFLAT(MI));
+
+  // SCRATCH instructions always access scratch.
+  if (TII->isFLATScratch(MI))
+    return true;
+
+  // GLOBAL instructions never access scratch.
+  if (TII->isFLATGlobal(MI))
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access scratch.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves scratch.
+  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+    unsigned AS = Memop->getAddrSpace();
+    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+  });
+}
+
+bool AMDGPUWaitCntInserter::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
+  return SIInstrInfo::isVMEM(MI) ||
+         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
+}
+
+bool AMDGPUWaitCntInserter::generateWaitcnt(
+    AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It,
+    MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+    MachineInstr *OldWaitcntInstr) {
+  bool Modified = false;
+
+  if (OldWaitcntInstr)
+    // Try to merge the required wait with preexisting waitcnt instructions.
+    // Also erase redundant waitcnt.
+    Modified =
+        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+
+  // Any counts that could have been applied to any existing waitcnt
+  // instructions will have been done so, now deal with any remaining.
+  ScoreBrackets.applyWaitcnt(Wait);
+
+  // ExpCnt can be merged into VINTERP.
+  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
+      SIInstrInfo::isVINTERP(*It)) {
+    MachineOperand *WaitExp =
+        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
+    if (Wait.ExpCnt < WaitExp->getImm()) {
+      WaitExp->setImm(Wait.ExpCnt);
+      Modified = true;
+    }
+    Wait.ExpCnt = ~0u;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
+                      << "Update Instr: " << *It);
+  }
+
+  if (WCG->createNewWaitcnt(Block, It, Wait))
+    Modified = true;
+
+  return Modified;
+}
+
+// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
+// end of the given block if needed.
+bool AMDGPUWaitCntInserter::generateWaitcntBlockEnd(
+    MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+    MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+
+  unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
+  unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
+  unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
+
+  if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
+    return false;
+
+  if (LoadCntPending != 0)
+    Wait.LoadCnt = 0;
+  if (SampleCntPending != 0)
+    Wait.SampleCnt = 0;
+  if (BvhCntPending != 0)
+    Wait.BvhCnt = 0;
+
+  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+                         OldWaitcntInstr);
+}
+
+bool AMDGPUWaitCntInserter::insertWaitCntsInFunction(MachineFunction &MF,
+                                                     VGPRInstsSet *VGPRInsts) {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+  InstCounterType SmemAccessCounter =
+      eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
+
+  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
+  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+  RegisterEncoding Encoding = {};
+  Encoding.VGPR0 =
+      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
+  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
+  Encoding.SGPR0 =
+      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
+  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
+
+  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+  bool Modified = false;
+
+  MachineBasicBlock &EntryBB = MF.front();
+  MachineBasicBlock::iterator I = EntryBB.begin();
+
+  if (!MFI->isEntryFunction()) {
+    // Wait for any outstanding memory operations that the input registers may
+    // depend on. We can't track them and it's better to do the wait after the
+    // costly call sequence.
+
+    // TODO: Could insert earlier and schedule more liberally with operations
+    // that only use caller preserved registers.
+    for (MachineBasicBlock::iterator E = EntryBB.end();
+         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+      ;
+
+    if (ST->hasExtendedWaitCounts()) {
+      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+          .addImm(0);
+      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+          continue;
+
+        BuildMI(EntryBB, I, DebugLoc(),
+                TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(0);
+      }
+    } else {
+      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+    }
+
+    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
+        ST, MaxCounter, Encoding, WaitEventMaskForInst, SmemAccessCounter);
+    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
+    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
+    Modified = true;
+  }
+
+  // Keep iterating over the blocks in reverse post order, inserting and
+  // updating s_waitcnt where needed, until a fixed point is reached.
+  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
+    BlockInfos.insert({MBB, BlockInfo()});
+
+  std::unique_ptr<WaitcntBrackets> Brackets;
+  bool Repeat;
+  do {
+    Repeat = false;
+
+    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
+         ++BII) {
+      MachineBasicBlock *MBB = BII->first;
+      BlockInfo &BI = BII->second;
+      if (!BI.Dirty)
+        continue;
+
+      if (BI.Incoming) {
+        if (!Brackets)
+          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
+        else
+          *Brackets = *BI.Incoming;
+      } else {
+        if (!Brackets)
+          Brackets = std::make_unique<WaitcntBrackets>(ST, MaxCounter, Encoding,
+                                                       WaitEventMaskForInst,
+                                                       SmemAccessCounter);
+        else
+          *Brackets = WaitcntBrackets(ST, MaxCounter, Encoding,
+                                      WaitEventMaskForInst, SmemAccessCounter);
+      }
+
+      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets, VGPRInsts);
+      BI.Dirty = false;
+
+      if (Brackets->hasPendingEvent()) {
+        BlockInfo *MoveBracketsToSucc = nullptr;
+        for (MachineBasicBlock *Succ : MBB->successors()) {
+          auto SuccBII = BlockInfos.find(Succ);
+          BlockInfo &SuccBI = SuccBII->second;
+          if (!SuccBI.Incoming) {
+            SuccBI.Dirty = true;
+            if (SuccBII <= BII)
+              Repeat = true;
+            if (!MoveBracketsToSucc) {
+              MoveBracketsToSucc = &SuccBI;
+            } else {
+              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
+            }
+          } else if (SuccBI.Incoming->merge(*Brackets)) {
+            SuccBI.Dirty = true;
+            if (SuccBII <= BII)
+              Repeat = true;
+          }
+        }
+        if (MoveBracketsToSucc)
+          MoveBracketsToSucc->Incoming = std::move(Brackets);
+      }
+    }
+  } while (Repeat);
+
+  if (ST->hasScalarStores()) {
+    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+    bool HaveScalarStores = false;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        if (!HaveScalarStores && TII->isScalarStore(MI))
+          HaveScalarStores = true;
+
+        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+          EndPgmBlocks.push_back(&MBB);
+      }
+    }
+
+    if (HaveScalarStores) {
+      // If scalar writes are used, the cache must be flushed or else the next
+      // wave to reuse the same scratch memory can be clobbered.
+      //
+      // Insert s_dcache_wb at wave termination points if there were any scalar
+      // stores, and only if the cache hasn't already been flushed. This could
+      // be improved by looking across blocks for flushes in postdominating
+      // blocks from the stores but an explicitly requested flush is probably
+      // very rare.
+      for (MachineBasicBlock *MBB : EndPgmBlocks) {
+        bool SeenDCacheWB = false;
+
+        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+             I != E; ++I) {
+          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+            SeenDCacheWB = true;
+          else if (TII->isScalarStore(*I))
+            SeenDCacheWB = false;
+
+          // FIXME: It would be better to insert this before a waitcnt if any.
+          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+              !SeenDCacheWB) {
+            Modified = true;
+            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+          }
+        }
+      }
+    }
+  }
+
+  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
+  // instructions.
+  for (MachineInstr *MI : *VGPRInsts) {
+    if (ST->requiresNopBeforeDeallocVGPRs()) {
+      BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+    }
+    BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
+        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+    Modified = true;
+  }
+  VGPRInsts->clear();
+
+  return Modified;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h
new file mode 100644
index 00000000000000..b5f74c932672ba
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h
@@ -0,0 +1,531 @@
+//===- AMDGPUWaitCountUtils.h - Wait count insertion interface -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
+
+#include "GCNSubtarget.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Instruction counter types. The latest counter score associated with an
+// operand is tracked per counter type and used for determining whether an
+// s_waitcnt instruction needs to be emitted.
+enum InstCounterType : uint8_t {
+  CT_START = 0,
+  LOAD_CNT = CT_START, // VMcnt prior to gfx12.
+  DS_CNT,              // LGKMcnt prior to gfx12.
+  EXP_CNT,             //
+  STORE_CNT,           // VScnt in gfx10/gfx11.
+  NUM_NORMAL_INST_CNTS,
+  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+  BVH_CNT,                           // gfx12+ only.
+  KM_CNT,                            // gfx12+ only.
+  NUM_EXTENDED_INST_CNTS,
+  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
+
+} // namespace AMDGPU
+
+using AMDGPU::InstCounterType;
+
+template <> struct enum_iteration_traits<InstCounterType> {
+  static constexpr bool is_iterable = true;
+};
+
+namespace AMDGPU {
+
+// Return an iterator over all counters between the first counter and \c
+// MaxCounter (exclusive, default value yields an enumeration over all
+// counters).
+inline auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+  return enum_seq(CT_START, MaxCounter);
+}
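+
+// Usage sketch (for illustration): iterating the pre-gfx12 subset visits
+// LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT only:
+//
+//   for (auto T : inst_counter_types(NUM_NORMAL_INST_CNTS)) { ... }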
+
+enum WaitEventType : uint8_t {
+  VMEM_ACCESS,              // vector-memory read & write
+  VMEM_READ_ACCESS,         // vector-memory read
+  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
+  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
+  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
+  LDS_ACCESS,               // lds read & write
+  GDS_ACCESS,               // gds read & write
+  SQ_MESSAGE,               // send message
+  SMEM_ACCESS,              // scalar-memory read & write
+  EXP_GPR_LOCK,             // export holding on its data src
+  GDS_GPR_LOCK,             // GDS holding on its data and addr src
+  EXP_POS_ACCESS,           // write to export position
+  EXP_PARAM_ACCESS,         // write to export parameter
+  VMW_GPR_LOCK,             // vector-memory write holding on its data src
+  EXP_LDS_ACCESS,           // read by ldsdir counting as export
+  NUM_WAIT_EVENTS
+};
+
+using RegInterval = std::pair<int, int>;
+
+struct RegisterEncoding {
+  unsigned VGPR0;
+  unsigned VGPRL;
+  unsigned SGPR0;
+  unsigned SGPRL;
+};
+
+// The mapping is:
+//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
+//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
+//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
+  AGPR_OFFSET = 256,      // Maximum programmable AccVGPRs across all targets.
+  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
+  // Artificial register slots to track LDS writes into specific LDS locations
+  // if a location is known. When slots are exhausted or location is
+  // unknown use the first slot. The first slot is also always updated in
+  // addition to known location's slot to properly generate waits if dependent
+  // instruction's location is unknown.
+  EXTRA_VGPR_LDS = 0,
+  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
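+
+// Index arithmetic example (for illustration): under this mapping, VGPR v5
+// scores at index 5, AGPR a5 at 5 + AGPR_OFFSET == 261, and SGPR s3 at
+// NUM_ALL_VGPRS + 3 == 524.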
+
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+  // BUF instructions and MIMG instructions without a sampler.
+  VMEM_NOSAMPLER,
+  // MIMG instructions with a sampler.
+  VMEM_SAMPLER,
+  // BVH instructions
+  VMEM_BVH,
+  NUM_VMEM_TYPES
+};
+
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
+    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+    AMDGPU::S_WAIT_KMCNT};
+
+using WaitCntBitMaskFn = std::function<unsigned(const IsaVersion &)>;
+// This object maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple event types happen in the bracket, the wait
+// count may get decremented out of order; therefore we need to put in an
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+                  RegisterEncoding Encoding,
+                  const unsigned *WaitEventMaskForInst,
+                  InstCounterType SmemAccessCounter)
+      : ST(SubTarget), MaxCounter(MaxCounter), Encoding(Encoding),
+        WaitEventMaskForInst(WaitEventMaskForInst),
+        SmemAccessCounter(SmemAccessCounter) {
+    AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+    for (auto T : inst_counter_types()) {
+      auto Fn = getWaitCntBitMaskFn(T);
+      HardwareLimits[T] = Fn(IV);
+    }
+  }
+
+  unsigned getWaitCountMax(InstCounterType T) const {
+    return HardwareLimits[T];
+  }
+
+  bool isSmemCounter(InstCounterType T) const { return T == SmemAccessCounter; }
+
+  unsigned getScoreLB(InstCounterType T) const {
+    assert(T < NUM_INST_CNTS);
+    return ScoreLBs[T];
+  }
+
+  unsigned getScoreUB(InstCounterType T) const {
+    assert(T < NUM_INST_CNTS);
+    return ScoreUBs[T];
+  }
+
+  unsigned getScoreRange(InstCounterType T) const {
+    return getScoreUB(T) - getScoreLB(T);
+  }
+
+  unsigned getRegScore(int GprNo, InstCounterType T) const {
+    if (GprNo < NUM_ALL_VGPRS) {
+      return VgprScores[T][GprNo];
+    }
+    assert(isSmemCounter(T));
+    return SgprScores[GprNo - NUM_ALL_VGPRS];
+  }
+
+  bool merge(const WaitcntBrackets &Other);
+
+  RegInterval getRegInterval(const MachineInstr *MI,
+                             const MachineRegisterInfo *MRI,
+                             const SIRegisterInfo *TRI, unsigned OpNo) const;
+
+  bool counterOutOfOrder(InstCounterType T) const;
+  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
+  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+  void applyWaitcnt(InstCounterType T, unsigned Count);
+  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+                     const MachineRegisterInfo *MRI, WaitEventType E,
+                     MachineInstr &MI);
+
+  unsigned hasPendingEvent() const { return PendingEvents; }
+  unsigned hasPendingEvent(WaitEventType E) const {
+    return PendingEvents & (1 << E);
+  }
+  unsigned hasPendingEvent(InstCounterType T) const {
+    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+    assert((HasPending != 0) == (getScoreRange(T) != 0));
+    return HasPending;
+  }
+
+  bool hasMixedPendingEvents(InstCounterType T) const {
+    unsigned Events = hasPendingEvent(T);
+    // Return true if more than one bit is set in Events.
+    return Events & (Events - 1);
+  }
+
+  bool hasPendingFlat() const {
+    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+  }
+
+  void setPendingFlat() {
+    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+  }
+
+  // Return true if there might be pending writes to the specified vgpr by VMEM
+  // instructions with types different from V.
+  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+    assert(GprNo < NUM_ALL_VGPRS);
+    return VgprVmemTypes[GprNo] & ~(1 << V);
+  }
+
+  void clearVgprVmemTypes(int GprNo) {
+    assert(GprNo < NUM_ALL_VGPRS);
+    VgprVmemTypes[GprNo] = 0;
+  }
+
+  void setStateOnFunctionEntryOrReturn() {
+    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
+    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+  }
+
+  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+    return LDSDMAStores;
+  }
+
+  void print(raw_ostream &);
+  void dump() { print(dbgs()); }
+
+private:
+  struct MergeInfo {
+    unsigned OldLB;
+    unsigned OtherLB;
+    unsigned MyShift;
+    unsigned OtherShift;
+  };
+
+  WaitCntBitMaskFn getWaitCntBitMaskFn(InstCounterType T);
+  static bool mergeScore(const MergeInfo &M, unsigned &Score,
+                         unsigned OtherScore);
+
+  void setScoreLB(InstCounterType T, unsigned Val) {
+    assert(T < NUM_INST_CNTS);
+    ScoreLBs[T] = Val;
+  }
+
+  void setScoreUB(InstCounterType T, unsigned Val) {
+    assert(T < NUM_INST_CNTS);
+    ScoreUBs[T] = Val;
+
+    if (T != EXP_CNT)
+      return;
+
+    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+  }
+
+  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+    if (GprNo < NUM_ALL_VGPRS) {
+      VgprUB = std::max(VgprUB, GprNo);
+      VgprScores[T][GprNo] = Val;
+    } else {
+      assert(isSmemCounter(T));
+      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
+      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+    }
+  }
+
+  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+                   unsigned OpNo, unsigned Val);
+
+  const GCNSubtarget *ST = nullptr;
+  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
+  unsigned HardwareLimits[NUM_INST_CNTS] = {0};
+  RegisterEncoding Encoding = {};
+  const unsigned *WaitEventMaskForInst;
+  InstCounterType SmemAccessCounter;
+  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+  unsigned PendingEvents = 0;
+  // Remember the last flat memory operation.
+  unsigned LastFlat[NUM_INST_CNTS] = {0};
+  // Wait cnt scores for every vgpr.
+  // Keep track of VgprUB and SgprUB to make merges at block joins efficient.
+  int VgprUB = -1;
+  int SgprUB = -1;
+  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+  // Wait cnt scores for every sgpr; only DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only) is relevant.
+  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+  // write to each vgpr.
+  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+  // Store representative LDS DMA operations. The only useful info here is
+  // alias info. One store is kept per unique AAInfo.
+  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
+};
+
+struct BlockInfo {
+  std::unique_ptr<WaitcntBrackets> Incoming;
+  bool Dirty = true;
+};
+
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12, so this abstraction keeps the main
+// analysis logic simpler than it would otherwise have to be.
+class WaitCntGenerator {
+protected:
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  AMDGPU::IsaVersion IV;
+  InstCounterType MaxCounter;
+
+public:
+  WaitCntGenerator() {}
+  WaitCntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+      : ST(ST), TII(ST->getInstrInfo()),
+        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+  // Edits an existing sequence of wait count instructions according
+  // to an incoming Waitcnt value, which is itself updated to reflect
+  // any new wait count instructions that may need to be generated by
+  // WaitCntGenerator::createNewWaitcnt(). Returns true if any edits
+  // were made.
+  //
+  // This editing will usually just update operands, but it may also
+  // delete instructions if the incoming Wait value indicates they are not
+  // needed. It may also remove existing instructions for which a wait
+  // is needed, if it can be determined that it is better to generate new
+  // instructions later, as can happen on gfx12.
+  virtual bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const = 0;
+
+  // Generates new wait count instructions according to the value of
+  // Wait, returning true if any new instructions were created.
+  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+                                MachineBasicBlock::instr_iterator It,
+                                AMDGPU::Waitcnt Wait) = 0;
+
+  // Returns an array of bit masks which can be used to map values in
+  // WaitEventType to corresponding counter values in InstCounterType.
+  virtual const unsigned *getWaitEventMask() const = 0;
+
+  // Returns a new waitcnt with all counters except VScnt set to 0. If
+  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
+  virtual ~WaitCntGenerator() = default;
+
+  // Transform a soft waitcnt into a normal one.
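+  // For example, S_WAITCNT_soft is rewritten to S_WAITCNT.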
+  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
+    unsigned Opcode =
+        SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
+    if (Opcode == Waitcnt->getOpcode())
+      return false;
+
+    Waitcnt->setDesc(TII->get(Opcode));
+    return true;
+  }
+
+  // Create a mask value from the initializer list of wait event types.
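+  // For example, eventMask({LDS_ACCESS, GDS_ACCESS}) yields
+  // (1 << LDS_ACCESS) | (1 << GDS_ACCESS).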
+  unsigned eventMask(std::initializer_list<WaitEventType> Events) const {
+    unsigned Mask = 0;
+    for (auto &E : Events)
+      Mask |= 1 << E;
+
+    return Mask;
+  }
+};
+
+class WaitCntGeneratorPreGFX12 : public WaitCntGenerator {
+public:
+  WaitCntGeneratorPreGFX12() {}
+  WaitCntGeneratorPreGFX12(const GCNSubtarget *ST)
+      : WaitCntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
+  bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const override;
+  bool createNewWaitcnt(MachineBasicBlock &Block,
+                        MachineBasicBlock::instr_iterator It,
+                        AMDGPU::Waitcnt Wait) override;
+
+  const unsigned *getWaitEventMask() const override {
+    assert(ST);
+
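+    // Indexed by InstCounterType: LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT; the
+    // trailing zeros are SAMPLE_CNT, BVH_CNT and KM_CNT, which do not exist
+    // before gfx12.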
+    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
+                   VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+        0,
+        0,
+        0};
+
+    return WaitEventMaskForInstPreGFX12;
+  }
+};
+
+class WaitCntGeneratorGFX12Plus : public WaitCntGenerator {
+public:
+  WaitCntGeneratorGFX12Plus() {}
+  WaitCntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+      : WaitCntGenerator(ST, MaxCounter) {}
+
+  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
+  bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const override;
+  bool createNewWaitcnt(MachineBasicBlock &Block,
+                        MachineBasicBlock::instr_iterator It,
+                        AMDGPU::Waitcnt Wait) override;
+
+  const unsigned *getWaitEventMask() const override {
+    assert(ST);
+
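+    // Indexed by InstCounterType: LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
+    // SAMPLE_CNT, BVH_CNT and KM_CNT.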
+    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
+        eventMask({LDS_ACCESS, GDS_ACCESS}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+        eventMask({VMEM_SAMPLER_READ_ACCESS}),
+        eventMask({VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+
+    return WaitEventMaskForInstGFX12Plus;
+  }
+};
+
+using VGPRInstsSet = DenseSet<MachineInstr *>;
+
+/// This class provides the abstraction for the wait count insertions in a
+/// function. Virtual methods are provided to handle the waitcnt insertion in
+/// a basic block for various memory operations, as required by the subtarget.
+class AMDGPUWaitCntInserter {
+public:
+  AMDGPUWaitCntInserter() {}
+  AMDGPUWaitCntInserter(const GCNSubtarget *ST, const MachineRegisterInfo *MRI,
+                        WaitCntGenerator *WCG, InstCounterType MC)
+      : ST(ST), TII(ST->getInstrInfo()), TRI(ST->getRegisterInfo()), MRI(MRI),
+        WCG(WCG), MaxCounter(MC) {}
+  virtual ~AMDGPUWaitCntInserter() = default;
+
+  InstCounterType getMaxCounter() const { return MaxCounter; }
+
+  bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+                       MachineBasicBlock::instr_iterator It,
+                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+                       MachineInstr *OldWaitcntInstr);
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
+  bool insertWaitCntsInFunction(MachineFunction &MF, VGPRInstsSet *VGPRInsts);
+
+  virtual bool generateWaitcntInstBefore(MachineInstr &MI,
+                                         WaitcntBrackets &ScoreBrackets,
+                                         MachineInstr *OldWaitcntInstr,
+                                         bool FlushVmCnt,
+                                         VGPRInstsSet *VGPRInsts) = 0;
+
+  virtual bool insertWaitcntInBlock(MachineFunction &MF,
+                                    MachineBasicBlock &Block,
+                                    WaitcntBrackets &ScoreBrackets,
+                                    VGPRInstsSet *VGPRInsts) = 0;
+
+  virtual void updateEventWaitcntAfter(MachineInstr &Inst,
+                                       WaitcntBrackets *ScoreBrackets) = 0;
+
+protected:
+  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
+  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+
+  // WCG points to one of the derived generator objects. It must be
+  // re-initialised before use with a value constructed via one of the
+  // subtarget-taking constructors.
+  WaitCntGenerator *WCG = nullptr;
+  InstCounterType MaxCounter;
+};
+
+bool isWaitInstr(MachineInstr &Inst);
+VmemType getVmemType(const MachineInstr &Inst);
+bool callWaitsOnFunctionEntry(const MachineInstr &MI);
+bool callWaitsOnFunctionReturn(const MachineInstr &MI);
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E);
+bool readsVCCZ(const MachineInstr &MI);
+bool isCacheInvOrWBInst(MachineInstr &Inst);
+bool updateVMCntOnly(const MachineInstr &Inst);
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count);
+WaitCntGenerator *getWaitCntGenerator(MachineFunction &MF,
+                                      WaitCntGeneratorPreGFX12 &WCGPreGFX12,
+                                      WaitCntGeneratorGFX12Plus &WCGGFX12Plus,
+                                      InstCounterType &MaxCounter);
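+
+/// A minimal usage sketch for a waitcnt insertion pass built on these
+/// utilities. Illustrative only: MyWaitCntInserter stands for a hypothetical
+/// subclass implementing the pure virtual hooks, and getWaitCntGenerator is
+/// assumed to re-initialise whichever generator matches the subtarget.
+///
+/// \code
+///   WaitCntGeneratorPreGFX12 WCGPreGFX12;
+///   WaitCntGeneratorGFX12Plus WCGGFX12Plus;
+///   InstCounterType MaxCounter;
+///   WaitCntGenerator *WCG =
+///       getWaitCntGenerator(MF, WCGPreGFX12, WCGGFX12Plus, MaxCounter);
+///   MyWaitCntInserter Inserter(ST, MRI, WCG, MaxCounter);
+///   VGPRInstsSet VGPRInsts;
+///   bool Changed = Inserter.insertWaitCntsInFunction(MF, &VGPRInsts);
+/// \endcode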
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 19d3b690b1315d..c0b6b293f79e43 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDGPUMemoryUtils.cpp
   AMDGPUPALMetadata.cpp
+  AMDGPUWaitCountUtils.cpp
   AMDKernelCodeTUtils.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 48f00a82e3e1c6..90d9fd71f6e2cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -130,7 +130,7 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        MachinePostDominator Tree Construction
-; GCN-O0-NEXT:        SI insert wait instructions
+; GCN-O0-NEXT:        SI Insert Wait Instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation
 ; GCN-O0-NEXT:        Post RA hazard recognizer
@@ -396,7 +396,7 @@
 ; GCN-O1-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        MachinePostDominator Tree Construction
-; GCN-O1-NEXT:        SI insert wait instructions
+; GCN-O1-NEXT:        SI Insert Wait Instructions
 ; GCN-O1-NEXT:        Insert required mode register values
 ; GCN-O1-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-NEXT:        SI Final Branch Preparation
@@ -693,7 +693,7 @@
 ; GCN-O1-OPTS-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        MachinePostDominator Tree Construction
-; GCN-O1-OPTS-NEXT:        SI insert wait instructions
+; GCN-O1-OPTS-NEXT:        SI Insert Wait Instructions
 ; GCN-O1-OPTS-NEXT:        Insert required mode register values
 ; GCN-O1-OPTS-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-OPTS-NEXT:        SI Final Branch Preparation
@@ -996,7 +996,7 @@
 ; GCN-O2-NEXT:        MachineDominator Tree Construction
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        MachinePostDominator Tree Construction
-; GCN-O2-NEXT:        SI insert wait instructions
+; GCN-O2-NEXT:        SI Insert Wait Instructions
 ; GCN-O2-NEXT:        Insert required mode register values
 ; GCN-O2-NEXT:        SI Insert Hard Clauses
 ; GCN-O2-NEXT:        SI Final Branch Preparation
@@ -1311,7 +1311,7 @@
 ; GCN-O3-NEXT:        MachineDominator Tree Construction
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        MachinePostDominator Tree Construction
-; GCN-O3-NEXT:        SI insert wait instructions
+; GCN-O3-NEXT:        SI Insert Wait Instructions
 ; GCN-O3-NEXT:        Insert required mode register values
 ; GCN-O3-NEXT:        SI Insert Hard Clauses
 ; GCN-O3-NEXT:        SI Final Branch Preparation


