[llvm] [NFC][llvm] Fix incomplete type issues in LLVM (PR #182655)

via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 20 23:45:21 PST 2026


https://github.com/2876225417 updated https://github.com/llvm/llvm-project/pull/182655

>From f55c2c0e60bbcbd3e79a929911a049846f6e1417 Mon Sep 17 00:00:00 2001
From: ppqwqqq <2876225417 at qq.com>
Date: Sat, 21 Feb 2026 13:04:40 +0800
Subject: [PATCH] [llvm] Fix incomplete type issues in LLVM

---
 llvm/include/llvm/ADT/STLExtras.h           |   3 +
 llvm/include/llvm/IR/CFG.h                  |   1 +
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 542 ++++++++++----------
 llvm/lib/Target/BPF/BPFAsmPrinter.cpp       |   4 +
 llvm/lib/Target/BPF/BPFAsmPrinter.h         |   3 +-
 5 files changed, 291 insertions(+), 262 deletions(-)

diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 80c97e77724e9..c8276b17014e1 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -1221,6 +1221,9 @@ class indexed_accessor_range_base {
   class iterator : public indexed_accessor_iterator<iterator, BaseT, T,
                                                     PointerT, ReferenceT> {
   public:
+    iterator()
+        : indexed_accessor_iterator<iterator, BaseT, T, PointerT, ReferenceT>(
+              nullptr, 0) {}
     // Index into this iterator, invoking a static method on the derived type.
     ReferenceT operator*() const {
       return DerivedT::dereference_iterator(this->getBase(), this->getIndex());
diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h
index 96d3b2fbb5b0b..da1c1f645d25d 100644
--- a/llvm/include/llvm/IR/CFG.h
+++ b/llvm/include/llvm/IR/CFG.h
@@ -180,6 +180,7 @@ class SuccIterator
   };
 
 public:
+  SuccIterator() : Inst(nullptr), Idx(0) {}
   // begin iterator
   explicit inline SuccIterator(InstructionT *Inst) : Inst(Inst), Idx(0) {}
   // end iterator
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 12361da511f5a..32e32300acb6d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -487,192 +487,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
 };
 
-// Flags indicating which counters should be flushed in a loop preheader.
-struct PreheaderFlushFlags {
-  bool FlushVmCnt = false;
-  bool FlushDsCnt = false;
-};
-
-class SIInsertWaitcnts {
-public:
-  const GCNSubtarget *ST;
-  const SIInstrInfo *TII = nullptr;
-  const SIRegisterInfo *TRI = nullptr;
-  const MachineRegisterInfo *MRI = nullptr;
-  InstCounterType SmemAccessCounter;
-  InstCounterType MaxCounter;
-  bool IsExpertMode = false;
-
-private:
-  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
-  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
-  MachineLoopInfo *MLI;
-  MachinePostDominatorTree *PDT;
-  AliasAnalysis *AA = nullptr;
-
-  struct BlockInfo {
-    std::unique_ptr<WaitcntBrackets> Incoming;
-    bool Dirty = true;
-  };
-
-  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
-
-  bool ForceEmitWaitcnt[NUM_INST_CNTS];
-
-  std::unique_ptr<WaitcntGenerator> WCG;
-
-  // Remember call and return instructions in the function.
-  DenseSet<MachineInstr *> CallInsts;
-  DenseSet<MachineInstr *> ReturnInsts;
-
-  // Remember all S_ENDPGM instructions. The boolean flag is true if there might
-  // be outstanding stores but definitely no outstanding scratch stores, to help
-  // with insertion of DEALLOC_VGPRS messages.
-  DenseMap<MachineInstr *, bool> EndPgmInsts;
-
-  AMDGPU::HardwareLimits Limits;
-
-public:
-  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
-                   AliasAnalysis *AA)
-      : MLI(MLI), PDT(PDT), AA(AA) {
-    (void)ForceExpCounter;
-    (void)ForceLgkmCounter;
-    (void)ForceVMCounter;
-  }
-
-  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
-
-  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
-                                             const WaitcntBrackets &Brackets);
-  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
-                                         const WaitcntBrackets &ScoreBrackets);
-  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
-  bool isDSRead(const MachineInstr &MI) const;
-  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
-  bool run(MachineFunction &MF);
-
-  void setForceEmitWaitcnt() {
-// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
-// For debug builds, get the debug counter info and adjust if need be
-#ifndef NDEBUG
-    if (DebugCounter::isCounterSet(ForceExpCounter) &&
-        DebugCounter::shouldExecute(ForceExpCounter)) {
-      ForceEmitWaitcnt[EXP_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[EXP_CNT] = false;
-    }
-
-    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
-        DebugCounter::shouldExecute(ForceLgkmCounter)) {
-      ForceEmitWaitcnt[DS_CNT] = true;
-      ForceEmitWaitcnt[KM_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[DS_CNT] = false;
-      ForceEmitWaitcnt[KM_CNT] = false;
-    }
-
-    if (DebugCounter::isCounterSet(ForceVMCounter) &&
-        DebugCounter::shouldExecute(ForceVMCounter)) {
-      ForceEmitWaitcnt[LOAD_CNT] = true;
-      ForceEmitWaitcnt[SAMPLE_CNT] = true;
-      ForceEmitWaitcnt[BVH_CNT] = true;
-    } else {
-      ForceEmitWaitcnt[LOAD_CNT] = false;
-      ForceEmitWaitcnt[SAMPLE_CNT] = false;
-      ForceEmitWaitcnt[BVH_CNT] = false;
-    }
-
-    ForceEmitWaitcnt[VA_VDST] = false;
-    ForceEmitWaitcnt[VM_VSRC] = false;
-#endif // NDEBUG
-  }
-
-  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
-  // instruction.
-  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
-    switch (Inst.getOpcode()) {
-    // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
-    case AMDGPU::GLOBAL_INV:
-      return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
-                                // VGPRs
-    case AMDGPU::GLOBAL_WB:
-    case AMDGPU::GLOBAL_WBINV:
-      return VMEM_WRITE_ACCESS; // tracked using storecnt
-    default:
-      break;
-    }
-
-    // Maps VMEM access types to their corresponding WaitEventType.
-    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
-        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
-
-    assert(SIInstrInfo::isVMEM(Inst));
-    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
-    // these should use VM_CNT.
-    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
-      return VMEM_ACCESS;
-    if (Inst.mayStore() &&
-        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
-      if (TII->mayAccessScratch(Inst))
-        return SCRATCH_WRITE_ACCESS;
-      return VMEM_WRITE_ACCESS;
-    }
-    if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
-      return VMEM_ACCESS;
-    return VmemReadMapping[getVmemType(Inst)];
-  }
-
-  std::optional<WaitEventType>
-  getExpertSchedulingEventType(const MachineInstr &Inst) const;
-
-  bool isAsync(const MachineInstr &MI) const {
-    if (!SIInstrInfo::isLDSDMA(MI))
-      return false;
-    if (SIInstrInfo::usesASYNC_CNT(MI))
-      return true;
-    const MachineOperand *Async =
-        TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
-    return Async && (Async->getImm());
-  }
-
-  bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
-    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
-  }
-
-  bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
-    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
-  }
-
-  bool isVmemAccess(const MachineInstr &MI) const;
-  bool generateWaitcntInstBefore(MachineInstr &MI,
-                                 WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr,
-                                 PreheaderFlushFlags FlushFlags);
-  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
-                       MachineBasicBlock::instr_iterator It,
-                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
-                       MachineInstr *OldWaitcntInstr);
-  void updateEventWaitcntAfter(MachineInstr &Inst,
-                               WaitcntBrackets *ScoreBrackets);
-  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
-                    MachineBasicBlock *Block) const;
-  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
-                             WaitcntBrackets &ScoreBrackets);
-  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
-                            WaitcntBrackets &ScoreBrackets);
-  /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
-  /// Legalizer. Returns true if block was modified.
-  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
-  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                         bool ExpertMode) const;
-  const WaitEventSet &getWaitEvents(InstCounterType T) const {
-    return WCG->getWaitEvents(T);
-  }
-  InstCounterType getCounterFromEvent(WaitEventType E) const {
-    return WCG->getCounterFromEvent(E);
-  }
-};
+class SIInsertWaitcnts;
 
 // This objects maintains the current score brackets of each wait counter, and
 // a per-register scoreboard for each wait counter.
@@ -684,10 +499,7 @@ class SIInsertWaitcnts {
 // "s_waitcnt 0" before use.
 class WaitcntBrackets {
 public:
-  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
-    assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
-  }
-
+  WaitcntBrackets(const SIInsertWaitcnts *Context);
 #ifndef NDEBUG
   ~WaitcntBrackets() {
     unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
@@ -709,9 +521,7 @@ class WaitcntBrackets {
   }
 #endif
 
-  bool isSmemCounter(InstCounterType T) const {
-    return T == Context->SmemAccessCounter || T == X_CNT;
-  }
+  bool isSmemCounter(InstCounterType T) const;
 
   unsigned getSgprScoresIdx(InstCounterType T) const {
     assert(isSmemCounter(T) && "Invalid SMEM counter");
@@ -785,18 +595,8 @@ class WaitcntBrackets {
   bool hasPendingEvent(WaitEventType E) const {
     return PendingEvents.contains(E);
   }
-  bool hasPendingEvent(InstCounterType T) const {
-    bool HasPending = PendingEvents & Context->getWaitEvents(T);
-    assert(HasPending == !empty(T) &&
-           "Expected pending events iff scoreboard is not empty");
-    return HasPending;
-  }
-
-  bool hasMixedPendingEvents(InstCounterType T) const {
-    WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
-    // Return true if more than one bit is set in Events.
-    return Events.twoOrMore();
-  }
+  bool hasPendingEvent(InstCounterType T) const;
+  bool hasMixedPendingEvents(InstCounterType T) const;
 
   bool hasPendingFlat() const {
     return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
@@ -814,23 +614,13 @@ class WaitcntBrackets {
     return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
   }
 
-  unsigned getPendingGDSWait() const {
-    return std::min(getScoreUB(DS_CNT) - LastGDS,
-                    getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
-  }
+  unsigned getPendingGDSWait() const;
 
   void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
 
   // Return true if there might be pending writes to the vgpr-interval by VMEM
   // instructions with types different from V.
-  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
-    for (MCRegUnit RU : regunits(Reg)) {
-      auto It = VMem.find(toVMEMID(RU));
-      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
-        return true;
-    }
-    return false;
-  }
+  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const;
 
   void clearVgprVmemTypes(MCPhysReg Reg) {
     for (MCRegUnit RU : regunits(Reg)) {
@@ -842,11 +632,7 @@ class WaitcntBrackets {
     }
   }
 
-  void setStateOnFunctionEntryOrReturn() {
-    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
-                              getWaitCountMax(Context->getLimits(), STORE_CNT));
-    PendingEvents |= Context->getWaitEvents(STORE_CNT);
-  }
+  void setStateOnFunctionEntryOrReturn();
 
   ArrayRef<const MachineInstr *> getLDSDMAStores() const {
     return LDSDMAStores;
@@ -881,50 +667,15 @@ class WaitcntBrackets {
   bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                        ArrayRef<CounterValueArray> OtherMarks);
 
-  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
-    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
-    if (!Context->TRI->isInAllocatableClass(Reg))
-      return {{}, {}};
-    const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
-    unsigned Size = Context->TRI->getRegSizeInBits(*RC);
-    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
-      Reg = Context->TRI->get32BitRegister(Reg);
-    return Context->TRI->regunits(Reg);
-  }
+  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const;
 
   void setScoreLB(InstCounterType T, unsigned Val) {
     assert(T < NUM_INST_CNTS);
     ScoreLBs[T] = Val;
   }
 
-  void setScoreUB(InstCounterType T, unsigned Val) {
-    assert(T < NUM_INST_CNTS);
-    ScoreUBs[T] = Val;
-
-    if (T != EXP_CNT)
-      return;
-
-    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
-      ScoreLBs[EXP_CNT] =
-          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
-  }
-
-  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
-    const SIRegisterInfo *TRI = Context->TRI;
-    if (Reg == AMDGPU::SCC) {
-      SCCScore = Val;
-    } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
-      for (MCRegUnit RU : regunits(Reg))
-        VMem[toVMEMID(RU)].Scores[T] = Val;
-    } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
-      auto STy = getSgprScoresIdx(T);
-      for (MCRegUnit RU : regunits(Reg))
-        SGPRs[RU].Scores[STy] = Val;
-    } else {
-      llvm_unreachable("Register cannot be tracked/unknown register!");
-    }
-  }
-
+  void setScoreUB(InstCounterType T, unsigned Val);
+  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val);
   void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
     VMem[TID].Scores[T] = Val;
   }
@@ -1000,6 +751,193 @@ class WaitcntBrackets {
   CounterValueArray AsyncScore{};
 };
 
+// Flags indicating which counters should be flushed in a loop preheader.
+struct PreheaderFlushFlags {
+  bool FlushVmCnt = false;
+  bool FlushDsCnt = false;
+};
+
+class SIInsertWaitcnts {
+public:
+  const GCNSubtarget *ST;
+  const SIInstrInfo *TII = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+  InstCounterType SmemAccessCounter;
+  InstCounterType MaxCounter;
+  bool IsExpertMode = false;
+
+private:
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
+  MachineLoopInfo *MLI;
+  MachinePostDominatorTree *PDT;
+  AliasAnalysis *AA = nullptr;
+
+  struct BlockInfo {
+    std::unique_ptr<WaitcntBrackets> Incoming;
+    bool Dirty = true;
+  };
+
+  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+
+  bool ForceEmitWaitcnt[NUM_INST_CNTS];
+
+  std::unique_ptr<WaitcntGenerator> WCG;
+
+  // Remember call and return instructions in the function.
+  DenseSet<MachineInstr *> CallInsts;
+  DenseSet<MachineInstr *> ReturnInsts;
+
+  // Remember all S_ENDPGM instructions. The boolean flag is true if there might
+  // be outstanding stores but definitely no outstanding scratch stores, to help
+  // with insertion of DEALLOC_VGPRS messages.
+  DenseMap<MachineInstr *, bool> EndPgmInsts;
+
+  AMDGPU::HardwareLimits Limits;
+
+public:
+  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+                   AliasAnalysis *AA)
+      : MLI(MLI), PDT(PDT), AA(AA) {
+    (void)ForceExpCounter;
+    (void)ForceLgkmCounter;
+    (void)ForceVMCounter;
+  }
+
+  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
+
+  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
+                                             const WaitcntBrackets &Brackets);
+  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
+                                         const WaitcntBrackets &ScoreBrackets);
+  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
+  bool isDSRead(const MachineInstr &MI) const;
+  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
+  bool run(MachineFunction &MF);
+
+  void setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
+// For debug builds, get the debug counter info and adjust if need be
+#ifndef NDEBUG
+    if (DebugCounter::isCounterSet(ForceExpCounter) &&
+        DebugCounter::shouldExecute(ForceExpCounter)) {
+      ForceEmitWaitcnt[EXP_CNT] = true;
+    } else {
+      ForceEmitWaitcnt[EXP_CNT] = false;
+    }
+
+    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+        DebugCounter::shouldExecute(ForceLgkmCounter)) {
+      ForceEmitWaitcnt[DS_CNT] = true;
+      ForceEmitWaitcnt[KM_CNT] = true;
+    } else {
+      ForceEmitWaitcnt[DS_CNT] = false;
+      ForceEmitWaitcnt[KM_CNT] = false;
+    }
+
+    if (DebugCounter::isCounterSet(ForceVMCounter) &&
+        DebugCounter::shouldExecute(ForceVMCounter)) {
+      ForceEmitWaitcnt[LOAD_CNT] = true;
+      ForceEmitWaitcnt[SAMPLE_CNT] = true;
+      ForceEmitWaitcnt[BVH_CNT] = true;
+    } else {
+      ForceEmitWaitcnt[LOAD_CNT] = false;
+      ForceEmitWaitcnt[SAMPLE_CNT] = false;
+      ForceEmitWaitcnt[BVH_CNT] = false;
+    }
+
+    ForceEmitWaitcnt[VA_VDST] = false;
+    ForceEmitWaitcnt[VM_VSRC] = false;
+#endif // NDEBUG
+  }
+
+  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
+  // instruction.
+  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
+    switch (Inst.getOpcode()) {
+    // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
+    case AMDGPU::GLOBAL_INV:
+      return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
+                                // VGPRs
+    case AMDGPU::GLOBAL_WB:
+    case AMDGPU::GLOBAL_WBINV:
+      return VMEM_WRITE_ACCESS; // tracked using storecnt
+    default:
+      break;
+    }
+
+    // Maps VMEM access types to their corresponding WaitEventType.
+    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
+        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+
+    assert(SIInstrInfo::isVMEM(Inst));
+    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+    // these should use VM_CNT.
+    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+      return VMEM_ACCESS;
+    if (Inst.mayStore() &&
+        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
+      if (TII->mayAccessScratch(Inst))
+        return SCRATCH_WRITE_ACCESS;
+      return VMEM_WRITE_ACCESS;
+    }
+    if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+      return VMEM_ACCESS;
+    return VmemReadMapping[getVmemType(Inst)];
+  }
+
+  std::optional<WaitEventType>
+  getExpertSchedulingEventType(const MachineInstr &Inst) const;
+
+  bool isAsync(const MachineInstr &MI) const {
+    if (!SIInstrInfo::isLDSDMA(MI))
+      return false;
+    if (SIInstrInfo::usesASYNC_CNT(MI))
+      return true;
+    const MachineOperand *Async =
+        TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
+    return Async && (Async->getImm());
+  }
+
+  bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
+    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
+  }
+
+  bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
+    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
+  }
+
+  bool isVmemAccess(const MachineInstr &MI) const;
+  bool generateWaitcntInstBefore(MachineInstr &MI,
+                                 WaitcntBrackets &ScoreBrackets,
+                                 MachineInstr *OldWaitcntInstr,
+                                 PreheaderFlushFlags FlushFlags);
+  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+                       MachineBasicBlock::instr_iterator It,
+                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+                       MachineInstr *OldWaitcntInstr);
+  void updateEventWaitcntAfter(MachineInstr &Inst,
+                               WaitcntBrackets *ScoreBrackets);
+  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
+                    MachineBasicBlock *Block) const;
+  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
+                             WaitcntBrackets &ScoreBrackets);
+  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+                            WaitcntBrackets &ScoreBrackets);
+  /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
+  /// Legalizer. Returns true if block was modified.
+  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
+  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                         bool ExpertMode) const;
+  const WaitEventSet &getWaitEvents(InstCounterType T) const {
+    return WCG->getWaitEvents(T);
+  }
+  InstCounterType getCounterFromEvent(WaitEventType E) const {
+    return WCG->getCounterFromEvent(E);
+  }
+};
+
 class SIInsertWaitcntsLegacy : public MachineFunctionPass {
 public:
   static char ID;
@@ -1021,6 +959,90 @@ class SIInsertWaitcntsLegacy : public MachineFunctionPass {
   }
 };
 
+WaitcntBrackets::WaitcntBrackets(const SIInsertWaitcnts *Context)
+    : Context{Context} {
+  assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
+}
+
+bool WaitcntBrackets::isSmemCounter(InstCounterType T) const {
+  return T == Context->SmemAccessCounter || T == X_CNT;
+}
+
+bool WaitcntBrackets::hasPendingEvent(InstCounterType T) const {
+  bool HasPending = PendingEvents & Context->getWaitEvents(T);
+  assert(HasPending == !empty(T) &&
+         "Expected pending events iff scoreboard is not empty");
+  return HasPending;
+}
+
+bool WaitcntBrackets::hasMixedPendingEvents(InstCounterType T) const {
+  WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
+  // Return true if more than one bit is set in Events.
+  return Events.twoOrMore();
+}
+
+unsigned WaitcntBrackets::getPendingGDSWait() const {
+  return std::min(getScoreUB(DS_CNT) - LastGDS,
+                  getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
+}
+
+bool WaitcntBrackets::hasOtherPendingVmemTypes(MCPhysReg Reg,
+                                               VmemType V) const {
+  for (MCRegUnit RU : regunits(Reg)) {
+    auto It = VMem.find(toVMEMID(RU));
+    if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
+      return true;
+  }
+  return false;
+}
+
+void WaitcntBrackets::setStateOnFunctionEntryOrReturn() {
+  setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
+                            getWaitCountMax(Context->getLimits(), STORE_CNT));
+  PendingEvents |= Context->getWaitEvents(STORE_CNT);
+}
+
+iterator_range<MCRegUnitIterator>
+WaitcntBrackets::regunits(MCPhysReg Reg) const {
+  assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
+  if (!Context->TRI->isInAllocatableClass(Reg))
+    return {{}, {}};
+  const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
+  unsigned Size = Context->TRI->getRegSizeInBits(*RC);
+  if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
+    Reg = Context->TRI->get32BitRegister(Reg);
+  return Context->TRI->regunits(Reg);
+}
+
+void WaitcntBrackets::setScoreUB(InstCounterType T, unsigned Val) {
+  assert(T < NUM_INST_CNTS);
+  ScoreUBs[T] = Val;
+
+  if (T != EXP_CNT)
+    return;
+
+  if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
+    ScoreLBs[EXP_CNT] =
+        ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
+}
+
+void WaitcntBrackets::setRegScore(MCPhysReg Reg, InstCounterType T,
+                                  unsigned Val) {
+  const SIRegisterInfo *TRI = Context->TRI;
+  if (Reg == AMDGPU::SCC) {
+    SCCScore = Val;
+  } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
+    for (MCRegUnit RU : regunits(Reg))
+      VMem[toVMEMID(RU)].Scores[T] = Val;
+  } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
+    auto STy = getSgprScoresIdx(T);
+    for (MCRegUnit RU : regunits(Reg))
+      SGPRs[RU].Scores[STy] = Val;
+  } else {
+    llvm_unreachable("Register cannot be tracked/unknown register!");
+  }
+}
+
 } // end anonymous namespace
 
 void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index abe081c0c76fd..0c618d7582d4c 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -40,6 +40,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
+BPFAsmPrinter::BPFAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr), TM(TM) {}
+
 bool BPFAsmPrinter::doInitialization(Module &M) {
   AsmPrinter::doInitialization(M);
 
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h
index 75a1d7ed9f884..1031a78a1fe2f 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.h
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h
@@ -18,8 +18,7 @@ namespace llvm {
 class BPFAsmPrinter : public AsmPrinter {
 public:
   explicit BPFAsmPrinter(TargetMachine &TM,
-                         std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr), TM(TM) {}
+                         std::unique_ptr<MCStreamer> Streamer);
 
   StringRef getPassName() const override { return "BPF Assembly Printer"; }
   bool doInitialization(Module &M) override;



More information about the llvm-commits mailing list