[llvm] [AMDGPU] Optionally Use GCNRPTrackers during scheduling (PR #93090)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 7 16:22:28 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/93090

>From 434b5983e22da95a4a1648316c576f4af16ae02f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 12:55:07 -0700
Subject: [PATCH 01/26] [AMDGPU] NFC: Add BBLiveOutMap & LiveOut Cache

Change-Id: I63cfd44e635cc4bee0e6780ca43b692c46e940b7
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 58 ++++++++++++++++++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   | 42 ++++++++++++++-
 2 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d6958d9055fade..0a1a72c230db85 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool>
                         "Wave Limited (amdgpu-limit-wave-threshold)."),
                cl::init(false));
 
+static cl::opt<bool> GCNTrackers(
+    "amdgpu-use-amdgpu-trackers", cl::Hidden,
+    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
+    cl::init(false));
+
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -571,7 +576,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
-      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
+      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
+      RegionLiveOuts(this, /*IsLiveOut=*/true) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
   if (RelaxedOcc) {
@@ -613,6 +619,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   return RPTracker.moveMaxPressure();
 }
 
+static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
+                                        MachineBasicBlock::iterator RegionEnd) {
+  auto REnd = RegionEnd == RegionBegin->getParent()->end()
+                  ? std::prev(RegionEnd)
+                  : RegionEnd;
+  return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+}
+
 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                 const MachineBasicBlock *MBB) {
   GCNDownwardRPTracker RPTracker(*LIS);
@@ -687,20 +701,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
 }
 
 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
-GCNScheduleDAGMILive::getBBLiveInMap() const {
+GCNScheduleDAGMILive::getRegionLiveInMap() const {
   assert(!Regions.empty());
-  std::vector<MachineInstr *> BBStarters;
-  BBStarters.reserve(Regions.size());
+  std::vector<MachineInstr *> RegionFirstMIs;
+  RegionFirstMIs.reserve(Regions.size());
   auto I = Regions.rbegin(), E = Regions.rend();
   auto *BB = I->first->getParent();
   do {
     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
-    BBStarters.push_back(MI);
+    RegionFirstMIs.push_back(MI);
     do {
       ++I;
     } while (I != E && I->first->getParent() == BB);
   } while (I != E);
-  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+  return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
+}
+
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getRegionLiveOutMap() const {
+  assert(!Regions.empty());
+  std::vector<MachineInstr *> RegionLastMIs;
+  RegionLastMIs.reserve(Regions.size());
+  for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+    RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
+
+  return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
+}
+
+void RegionPressureMap::buildLiveRegMap() {
+  IdxToInstruction.clear();
+
+  BBLiveRegMap =
+      IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
+  for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+    MachineInstr *RegionKey =
+        IsLiveOut
+            ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
+            : &*DAG->Regions[I].first;
+    IdxToInstruction[I] = RegionKey;
+  }
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
@@ -726,8 +765,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
-  if (!Regions.empty())
-    BBLiveInMap = getBBLiveInMap();
+  if (!Regions.empty()) {
+    BBLiveInMap = getRegionLiveInMap();
+    if (GCNTrackers)
+      RegionLiveOuts.buildLiveRegMap();
+  }
 
   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
   while (S.advanceStage()) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f0aea2bc4ab865..c402fb1ef373c9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -163,6 +163,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
   return OS;
 }
 
+class GCNScheduleDAGMILive;
+class RegionPressureMap {
+  GCNScheduleDAGMILive *DAG;
+  // The live in/out pressure as indexed by the first or last MI in the region
+  // before scheduling.
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveRegMap;
+  // The mapping of RegionIDx to key instruction
+  DenseMap<unsigned, MachineInstr *> IdxToInstruction;
+  // Whether we are calculating LiveOuts or LiveIns
+  bool IsLiveOut;
+
+public:
+  RegionPressureMap() {}
+  RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
+      : DAG(GCNDAG), IsLiveOut(LiveOut) {}
+  // Build the Instr->LiveReg and RegionIdx->Instr maps
+  void buildLiveRegMap();
+
+  // Retrieve the LiveReg for a given RegionIdx
+  GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
+    assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end());
+    MachineInstr *Key = IdxToInstruction[RegionIdx];
+    return BBLiveRegMap[Key];
+  }
+};
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -170,6 +196,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
   friend class ILPInitialScheduleStage;
+  friend class RegionPressureMap;
 
   const GCNSubtarget &ST;
 
@@ -211,9 +238,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Temporary basic block live-in cache.
   DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
 
+  // The map of the initial first region instruction to region live in registers
   DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
 
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+  // Calculate the map of the initial first region instruction to region live in
+  // registers
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;
+
+  // Calculate the map of the initial last region instruction to region live out
+  // registers
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+  getRegionLiveOutMap() const;
+
+  // The live out registers per region. These are internally stored as a map of
+  // the initial last region instruction to region live out registers, but can
+  // be retrieved with the regionIdx by calls to getLiveRegsForRegionIdx.
+  RegionPressureMap RegionLiveOuts;
 
   // Return current region pressure.
   GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;

>From 6a57763122b140007aadc27ec9108762f5de350f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 13:34:59 -0700
Subject: [PATCH 02/26] [AMDGPU] NFC: Provide RPTracker interface for external
 iterators

Change-Id: I79b54722e6e858961486248d94766c3f3c161160
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 284 ++++++++++++++++++++--
 llvm/lib/Target/AMDGPU/GCNRegPressure.h   |  95 ++++++--
 2 files changed, 330 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index cb0624f11592d2..d1a50adc1918cf 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -289,6 +289,72 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
   }
 }
 
+static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
+                               Register RegUnit) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+    return Other.RegUnit == RegUnit;
+  });
+  if (I == RegUnits.end())
+    return LaneBitmask::getNone();
+  return I->LaneMask;
+}
+
+static LaneBitmask
+getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+                     bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+                     LaneBitmask SafeDefault,
+                     bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
+  if (RegUnit.isVirtual()) {
+    const LiveInterval &LI = LIS.getInterval(RegUnit);
+    LaneBitmask Result;
+    if (TrackLaneMasks && LI.hasSubRanges()) {
+      for (const LiveInterval::SubRange &SR : LI.subranges()) {
+        if (Property(SR, Pos))
+          Result |= SR.LaneMask;
+      }
+    } else if (Property(LI, Pos)) {
+      Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
+                              : LaneBitmask::getAll();
+    }
+
+    return Result;
+  } else {
+    const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+    // Be prepared for missing liveranges: We usually do not compute liveranges
+    // for physical registers on targets with many registers (GPUs).
+    if (LR == nullptr)
+      return SafeDefault;
+    return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+  }
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+                                  SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+                                  const MachineRegisterInfo &MRI,
+                                  const LiveIntervals *LIS,
+                                  bool Upward = false) {
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+    if (MO.isUndef())
+      continue;
+    const MachineInstr *MI = MO.getParent();
+    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+    bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
+                          : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
+    if (InRange) {
+      unsigned SubRegIdx = MO.getSubReg();
+      LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+      LastUseMask &= ~UseMask;
+      if (LastUseMask.none())
+        return LaneBitmask::getNone();
+    }
+  }
+  return LastUseMask;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // GCNRPTracker
 
@@ -344,17 +410,47 @@ void GCNRPTracker::reset(const MachineInstr &MI,
   MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// GCNUpwardRPTracker
-
-void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_,
-                               const LiveRegSet &LiveRegs_) {
+void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
+                         const LiveRegSet &LiveRegs_) {
   MRI = &MRI_;
   LiveRegs = LiveRegs_;
   LastTrackedMI = nullptr;
   MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
 }
 
+void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+  for (const RegisterMaskPair &P : DeadDefs) {
+    Register Reg = P.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+    CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
+  }
+  MaxPressure = max(MaxPressure, CurPressure);
+  for (const RegisterMaskPair &P : DeadDefs) {
+    Register Reg = P.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+    CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI);
+  }
+}
+
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
+                                           SlotIndex Pos) const {
+  return getLanesWithProperty(
+      LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(),
+      [](const LiveRange &LR, SlotIndex Pos) {
+        const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+        return S != nullptr && S->end == Pos.getRegSlot();
+      });
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
 void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(MRI && "call reset first");
 
@@ -415,6 +511,63 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(CurPressure == getRegPressure(*MRI, LiveRegs));
 }
 
+void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
+  assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+  SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+  // Account for register pressure similar to RegPressureTracker::recede().
+  RegisterOperands RegOpers;
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
+  assert(RegOpers.DeadDefs.empty());
+  RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+  RegOpers.detectDeadDefs(*MI, LIS);
+
+  // Boost max pressure for all dead defs together: bumpDeadDefs raises
+  // CurPressure, folds it into MaxPressure, then reverts CurPressure.
+  bumpDeadDefs(RegOpers.DeadDefs);
+
+  // Kill liveness at live defs.
+  for (const RegisterMaskPair &P : RegOpers.Defs) {
+    Register Reg = P.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LiveAfter = LiveRegs[Reg];
+    LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+    LaneBitmask DefLanes = P.LaneMask;
+    LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
+
+    // There may be parts of the register that were dead before the
+    // instruction, but became live afterwards. Similarly, some parts
+    // may have been killed in this instruction.
+    CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
+    CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI);
+    MaxPressure = max(MaxPressure, CurPressure);
+  }
+  // Generate liveness for uses.
+  for (const RegisterMaskPair &P : RegOpers.Uses) {
+    Register Reg = P.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    // If this register was also in a def operand, we've handled it
+    // with defs.
+    if (getRegLanes(RegOpers.Defs, Reg).any())
+      continue;
+    LaneBitmask LiveAfter = LiveRegs[Reg];
+    SlotIndex CurrIdx =
+        LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
+                      : LIS.getMBBEndIdx(MI->getParent());
+    ;
+    LaneBitmask LastUseMask =
+        findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true);
+    LastUseMask &= ~LiveAfter;
+    LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
+    CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
+  }
+  MaxPressure = max(MaxPressure, CurPressure);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // GCNDownwardRPTracker
 
@@ -431,28 +584,44 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
   return true;
 }
 
-bool GCNDownwardRPTracker::advanceBeforeNext() {
+bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
+                                             bool UseInternalIterator,
+                                             LiveIntervals *TheLIS) {
   assert(MRI && "call reset first");
-  if (!LastTrackedMI)
-    return NextMI == MBBEnd;
-
-  assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+  SlotIndex SI;
+  LiveIntervals *CurrLIS;
+  MachineInstr *CurrMI;
+  if (UseInternalIterator) {
+    if (!LastTrackedMI)
+      return NextMI == MBBEnd;
+
+    assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+    CurrLIS = const_cast<LiveIntervals *>(&LIS);
+    CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+
+    SI = NextMI == MBBEnd
+             ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
+             : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex();
+  } else { // !UseInternalIterator
+    CurrLIS = TheLIS;
+    SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex();
+    CurrMI = MI;
+  }
 
-  SlotIndex SI = NextMI == MBBEnd
-                     ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
-                     : LIS.getInstructionIndex(*NextMI).getBaseIndex();
   assert(SI.isValid());
 
   // Remove dead registers or mask bits.
   SmallSet<Register, 8> SeenRegs;
-  for (auto &MO : LastTrackedMI->operands()) {
+  for (auto &MO : CurrMI->operands()) {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       continue;
     if (MO.isUse() && !MO.readsReg())
       continue;
+    if (!UseInternalIterator && MO.isDef())
+      continue;
     if (!SeenRegs.insert(MO.getReg()).second)
       continue;
-    const LiveInterval &LI = LIS.getInterval(MO.getReg());
+    const LiveInterval &LI = CurrLIS->getInterval(MO.getReg());
     if (LI.hasSubRanges()) {
       auto It = LiveRegs.end();
       for (const auto &S : LI.subranges()) {
@@ -482,15 +651,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
 
   LastTrackedMI = nullptr;
 
-  return NextMI == MBBEnd;
+  return UseInternalIterator && (NextMI == MBBEnd);
 }
 
-void GCNDownwardRPTracker::advanceToNext() {
-  LastTrackedMI = &*NextMI++;
-  NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
+                                         bool UseInternalIterator) {
+  if (UseInternalIterator) {
+    LastTrackedMI = &*NextMI++;
+    NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+  } else {
+    LastTrackedMI = MI;
+  }
+
+  MachineInstr *CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
 
   // Add new registers or mask bits.
-  for (const auto &MO : LastTrackedMI->all_defs()) {
+  for (const auto &MO : CurrMI->all_defs()) {
     Register Reg = MO.getReg();
     if (!Reg.isVirtual())
       continue;
@@ -503,11 +679,12 @@ void GCNDownwardRPTracker::advanceToNext() {
   MaxPressure = max(MaxPressure, CurPressure);
 }
 
-bool GCNDownwardRPTracker::advance() {
-  if (NextMI == MBBEnd)
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
+                                   LiveIntervals *TheLIS) {
+  if (UseInternalIterator && NextMI == MBBEnd)
     return false;
-  advanceBeforeNext();
-  advanceToNext();
+  advanceBeforeNext(MI, UseInternalIterator, TheLIS);
+  advanceToNext(MI, UseInternalIterator);
   return true;
 }
 
@@ -549,6 +726,65 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
   });
 }
 
+void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
+  assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+  SlotIndex SlotIdx;
+  SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+  // Account for register pressure similar to RegPressureTracker::recede().
+  RegisterOperands RegOpers;
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
+  RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+
+  for (const RegisterMaskPair &Use : RegOpers.Uses) {
+    Register Reg = Use.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+    if (LastUseMask.none())
+      continue;
+    // The LastUseMask is queried from the liveness information of instruction
+    // which may be further down the schedule. Some lanes may actually not be
+    // last uses for the current position.
+    // FIXME: allow the caller to pass in the list of vreg uses that remain
+    // to be bottom-scheduled to avoid searching uses at each query.
+    SlotIndex CurrIdx;
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(
+        LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end());
+    if (IdxPos == MBB->end()) {
+      CurrIdx = LIS.getMBBEndIdx(MBB);
+    } else {
+      CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot();
+    }
+
+    LastUseMask =
+        findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS);
+    if (LastUseMask.none())
+      continue;
+
+    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask NewMask = LiveMask & ~LastUseMask;
+    CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+  }
+
+  // Generate liveness for defs.
+  for (const RegisterMaskPair &Def : RegOpers.Defs) {
+    Register Reg = Def.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask NewMask = LiveMask | Def.LaneMask;
+    CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+  }
+  MaxPressure = max(MaxPressure, CurPressure);
+
+  // Boost pressure for all dead defs together.
+  bumpDeadDefs(RegOpers.DeadDefs);
+}
+
 bool GCNUpwardRPTracker::isValid() const {
   const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
   const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 54dc1972d27619..a79e412ce33449 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -19,6 +19,7 @@
 
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/RegisterPressure.h"
 #include <algorithm>
 
 namespace llvm {
@@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1,
   return Diff;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
 class GCNRPTracker {
 public:
   using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
@@ -165,7 +169,14 @@ class GCNRPTracker {
   void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
              bool After);
 
+  void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
+
+  LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+
 public:
+  // reset tracker and set live register set to the specified value.
+  void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+
   // live regs for the current state
   const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
   const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
@@ -182,34 +193,40 @@ class GCNRPTracker {
 GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
                                      const MachineRegisterInfo &MRI);
 
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
 class GCNUpwardRPTracker : public GCNRPTracker {
 public:
   GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
 
-  // reset tracker and set live register set to the specified value.
-  void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+  using GCNRPTracker::reset;
 
-  // reset tracker at the specified slot index.
+  /// reset tracker at the specified slot index \p SI.
   void reset(const MachineRegisterInfo &MRI, SlotIndex SI) {
-    reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
+    GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
   }
 
-  // reset tracker to the end of the MBB.
+  /// reset tracker to the end of the \p MBB.
   void reset(const MachineBasicBlock &MBB) {
     reset(MBB.getParent()->getRegInfo(),
           LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
   }
 
-  // reset tracker to the point just after MI (in program order).
+  /// reset tracker to the point just after \p MI (in program order).
   void reset(const MachineInstr &MI) {
     reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot());
   }
 
-  // move to the state just before the MI (in program order).
+  /// Move to the state of RP just before the \p MI . If \p UseInternalIterator
+  /// is set, also update the internal iterators. Setting \p UseInternalIterator
+  /// to false allows for an externally managed iterator / program order.
   void recede(const MachineInstr &MI);
 
-  // checks whether the tracker's state after receding MI corresponds
-  // to reported by LIS.
+  void bumpUpwardPressure(const MachineInstr *MI);
+
+  /// \returns whether the tracker's state after receding MI corresponds
+  /// to the state reported by LIS.
   bool isValid() const;
 
   const GCNRegPressure &getMaxPressure() const { return MaxPressure; }
@@ -223,6 +240,9 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   }
 };
 
+////////////////////////////////////////////////////////////////////////////////
+// GCNDownwardRPTracker
+
 class GCNDownwardRPTracker : public GCNRPTracker {
   // Last position of reset or advanceBeforeNext
   MachineBasicBlock::const_iterator NextMI;
@@ -232,37 +252,62 @@ class GCNDownwardRPTracker : public GCNRPTracker {
 public:
   GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
 
+  using GCNRPTracker::reset;
+
   MachineBasicBlock::const_iterator getNext() const { return NextMI; }
 
-  // Return MaxPressure and clear it.
+  /// \returns MaxPressure and clears it.
   GCNRegPressure moveMaxPressure() {
     auto Res = MaxPressure;
     MaxPressure.clear();
     return Res;
   }
 
-  // Reset tracker to the point before the MI
-  // filling live regs upon this point using LIS.
-  // Returns false if block is empty except debug values.
+  /// Reset tracker to the point before the \p MI
+  /// filling \p LiveRegs upon this point using LIS.
+  /// \returns false if the block is empty except for debug values.
   bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
 
-  // Move to the state right before the next MI or after the end of MBB.
-  // Returns false if reached end of the block.
-  bool advanceBeforeNext();
-
-  // Move to the state at the MI, advanceBeforeNext has to be called first.
-  void advanceToNext();
-
-  // Move to the state at the next MI. Returns false if reached end of block.
-  bool advance();
-
-  // Advance instructions until before End.
+  /// Move to the state right before the next MI or after the end of MBB.
+  /// \returns false if the end of the block was reached.
+  /// If \p UseInternalIterator is true, then internal iterators are used and
+  /// set to process in program order. If \p UseInternalIterator is false, then
+  /// it is assumed that the tracker is using an externally managed iterator,
+  /// and advance* calls will not update the state of the iterator. In such
+  /// cases, the tracker will move to the state right before the provided \p MI
+  /// and use the provided \p TheLIS for RP calculations.
+  bool advanceBeforeNext(MachineInstr *MI = nullptr,
+                         bool UseInternalIterator = true,
+                         LiveIntervals *TheLIS = nullptr);
+
+  /// Move to the state at the MI, advanceBeforeNext has to be called first.
+  /// If \p UseInternalIterator is true, then internal iterators are used and
+  /// set to process in program order. If \p UseInternalIterator is false, then
+  /// it is assumed that the tracker is using an externally managed iterator,
+  /// and advance* calls will not update the state of the iterator. In such
+  /// cases, the tracker will move to the state at the provided \p MI .
+  void advanceToNext(MachineInstr *MI = nullptr,
+                     bool UseInternalIterator = true);
+
+  /// Move to the state at the next MI. \returns false if reached end of
+  /// block. If \p UseInternalIterator is true, then internal iterators are used
+  /// and set to process in program order. If \p UseInternalIterator is false,
+  /// then it is assumed that the tracker is using an externally managed
+  /// iterator, and advance* calls will not update the state of the iterator. In
+  /// such cases, the tracker will move to the state right before the provided
+  /// \p MI and use the provided \p TheLIS for RP calculations.
+  bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true,
+               LiveIntervals *TheLIS = nullptr);
+
+  /// Advance instructions until before \p End.
   bool advance(MachineBasicBlock::const_iterator End);
 
-  // Reset to Begin and advance to End.
+  /// Reset to \p Begin and advance to \p End.
   bool advance(MachineBasicBlock::const_iterator Begin,
                MachineBasicBlock::const_iterator End,
                const LiveRegSet *LiveRegsCopy = nullptr);
+
+  void bumpDownwardPressure(const MachineInstr *MI);
 };
 
 LaneBitmask getLiveLaneMask(unsigned Reg,

>From b625761fea8816058228f63c05b563723390f62c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 18:04:25 -0700
Subject: [PATCH 03/26] [AMDGPU] Optionally Use AMDGPU RPTrackers during
 scheduling

Change-Id: I6ae56149c1eb49ea85362267174cc6274c416330
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |  2 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |  1 -
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 90 ++++++++++++++++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     | 19 +++-
 4 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 13504508e2fb2e..9b1db3241e4327 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
   LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
                        "target occupancy = "
                     << TgtOcc << '\n');
-  GCNMaxOccupancySchedStrategy LStrgy(Context);
+  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler*/ true);
   unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
 
   for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a79e412ce33449..f78e4d7da0a1dd 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -176,7 +176,6 @@ class GCNRPTracker {
 public:
   // reset tracker and set live register set to the specified value.
   void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
-
   // live regs for the current state
   const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
   const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0a1a72c230db85..1e6d95d128709d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -67,6 +67,7 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+      TheTracker(*C->LIS), TheUpwardTracker(*C->LIS),
       HasHighPressure(false) {}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
@@ -156,14 +157,37 @@ static bool canUsePressureDiffs(const SUnit &SU) {
 static void getRegisterPressures(bool AtTop,
                                  const RegPressureTracker &RPTracker, SUnit *SU,
                                  std::vector<unsigned> &Pressure,
-                                 std::vector<unsigned> &MaxPressure) {
+                                 std::vector<unsigned> &MaxPressure,
+                                 GCNDownwardRPTracker &TheTracker,
+                                 GCNUpwardRPTracker &TheUpwardTracker,
+                                 ScheduleDAGMI *DAG) {
   // getDownwardPressure() and getUpwardPressure() make temporary changes to
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
-  if (AtTop)
-    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
-  else
-    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  if (!GCNTrackers) {
+    if (AtTop)
+      TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+    else
+      TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  } else {
+    if (AtTop) {
+      GCNDownwardRPTracker TempTopTracker(TheTracker);
+      auto MI = SU->getInstr();
+      TempTopTracker.advance(MI, true, DAG->getLIS());
+
+      Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum();
+      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false);
+    }
+
+    else {
+      GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
+      auto MI = SU->getInstr();
+      TempBotTracker.recede(*MI, true);
+
+      Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum();
+      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false);
+    }
+  }
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -192,8 +216,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   //
   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
   // PressureDiffs.
-  if (AtTop || !canUsePressureDiffs(*SU)) {
-    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+  if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG);
   } else {
     // Reserve 4 slots.
     Pressure.resize(4, 0);
@@ -211,7 +235,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
 
 #ifdef EXPENSIVE_CHECKS
     std::vector<unsigned> CheckPressure, CheckMaxPressure;
+<<<<<<< HEAD
     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+=======
+    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG);
+>>>>>>> 3fc6929b4a78... [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling
     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -299,8 +327,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
   if (DAG->isTrackingPressure()) {
-    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
-    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    SGPRPressure =
+        GCNTrackers
+            ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum()
+                            : TheUpwardTracker.getPressure().getSGPRNum())
+            : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+    VGPRPressure =
+        GCNTrackers
+            ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false)
+                            : TheUpwardTracker.getPressure().getVGPRNum(false))
+            : Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
   }
   ReadyQueue &Q = Zone.Available;
   for (SUnit *SU : Q) {
@@ -449,6 +485,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
+void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  if (GCNTrackers) {
+    MachineInstr *MI = SU->getInstr();
+    IsTopNode ? (void)TheTracker.advance(MI, true, DAG->getLIS())
+              : TheUpwardTracker.recede(*MI, true);
+  }
+
+  return GenericScheduler::schedNode(SU, IsTopNode);
+}
+
 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
   assert(CurrentStage && CurrentStage != SchedStages.end());
   return *CurrentStage;
@@ -475,12 +521,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
 }
 
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C)
+    const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  GCNTrackers = GCNTrackers & !IsLegacyScheduler;
 }
 
 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -787,6 +834,20 @@ void GCNScheduleDAGMILive::runSchedStages() {
         continue;
       }
 
+      if (GCNTrackers) {
+        GCNDownwardRPTracker *TheTracker = S.getTracker();
+        GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
+        GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+
+        reinterpret_cast<GCNRPTracker *>(TheTracker)->reset(
+            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+            *RegionLiveIns);
+        reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)->reset(
+            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+            RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+
+      }
+
       ScheduleDAGMILive::schedule();
       Stage->finalizeGCNRegion();
     }
@@ -1057,6 +1118,7 @@ void GCNSchedStage::finalizeGCNRegion() {
 void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
+
   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
@@ -1608,9 +1670,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     MachineInstr *MI = Entry.first;
     MachineInstr *OldMI = Entry.second;
 
-    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
-    DAG.BBLiveInMap.erase(OldMI);
-
     // Remove OldMI and update LIS
     Register Reg = MI->getOperand(0).getReg();
     LIS->RemoveMachineInstrFromMaps(*OldMI);
@@ -1628,6 +1687,11 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DAG.Regions = NewRegions;
   DAG.RescheduleRegions = NewRescheduleRegions;
 
+  DAG.BBLiveInMap = DAG.getBBLiveInMap();
+
+  if (GCNTrackers)
+    DAG.RegionLiveOuts.buildLiveRegMap();
+
   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c402fb1ef373c9..8088339fbd26c2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler {
   // Pointer to the current SchedStageID.
   SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
 
+  // GCN RP Tracker for top-down scheduling
+  mutable GCNDownwardRPTracker TheTracker;
+
+  // GCN RP Tracker for bottom-up scheduling
+  mutable GCNUpwardRPTracker TheUpwardTracker;
+
 public:
   // schedule() have seen register pressure over the critical limits and had to
   // track register pressure for actual scheduling heuristics.
@@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler {
 
   SUnit *pickNode(bool &IsTopNode) override;
 
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+
   void initialize(ScheduleDAGMI *DAG) override;
 
   unsigned getTargetOccupancy() { return TargetOccupancy; }
@@ -116,13 +124,19 @@ class GCNSchedStrategy : public GenericScheduler {
   bool hasNextStage() const;
 
   GCNSchedStageID getNextStage() const;
+
+  GCNDownwardRPTracker *getTracker() { return &TheTracker; }
+
+  GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
+
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
 /// maximum number of waves per simd).
 class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
 public:
-  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
+                               bool IsLegacyScheduler = false);
 };
 
 /// The goal of this scheduling strategy is to maximize ILP for a single wave
@@ -350,6 +364,9 @@ class GCNSchedStage {
   bool isRegionWithExcessRP() const {
     return DAG.RegionsWithExcessRP[RegionIdx];
   }
+  
+  // The region number this stage is currently working on
+  unsigned getRegionIdx() { return RegionIdx; }
 
   // Returns true if the new schedule may result in more spilling.
   bool mayCauseSpilling(unsigned WavesAfter);

>From 015cf529e7dee05570bee20bf1f069fbf36ec8f0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 14:46:28 -0700
Subject: [PATCH 04/26] Formatting

Change-Id: I1cb0a88e94f4156da6118fcd3724556939351c6d
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 46 +++++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  3 +-
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1e6d95d128709d..a6115afe0f03ce 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,9 +66,8 @@ static cl::opt<bool> GCNTrackers(
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
-    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
-      TheTracker(*C->LIS), TheUpwardTracker(*C->LIS),
-      HasHighPressure(false) {}
+    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS),
+      TheUpwardTracker(*C->LIS), HasHighPressure(false) {}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -175,8 +174,10 @@ static void getRegisterPressures(bool AtTop,
       auto MI = SU->getInstr();
       TempTopTracker.advance(MI, true, DAG->getLIS());
 
-      Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum();
-      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false);
+      Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+          TempTopTracker.getPressure().getSGPRNum();
+      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+          TempTopTracker.getPressure().getVGPRNum(false);
     }
 
     else {
@@ -184,8 +185,10 @@ static void getRegisterPressures(bool AtTop,
       auto MI = SU->getInstr();
       TempBotTracker.recede(*MI, true);
 
-      Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum();
-      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false);
+      Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+          TempBotTracker.getPressure().getSGPRNum();
+      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+          TempBotTracker.getPressure().getVGPRNum(false);
     }
   }
 }
@@ -217,7 +220,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
   // PressureDiffs.
   if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
-    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG);
+    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
+                         TheTracker, TheUpwardTracker, DAG);
   } else {
     // Reserve 4 slots.
     Pressure.resize(4, 0);
@@ -235,11 +239,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
 
 #ifdef EXPENSIVE_CHECKS
     std::vector<unsigned> CheckPressure, CheckMaxPressure;
-<<<<<<< HEAD
-    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
-=======
-    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG);
->>>>>>> 3fc6929b4a78... [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling
+    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
+                         TheTracker, TheUpwardTracker, DAG);
     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -837,15 +838,16 @@ void GCNScheduleDAGMILive::runSchedStages() {
       if (GCNTrackers) {
         GCNDownwardRPTracker *TheTracker = S.getTracker();
         GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
-        GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
-
-        reinterpret_cast<GCNRPTracker *>(TheTracker)->reset(
-            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-            *RegionLiveIns);
-        reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)->reset(
-            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-            RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
-
+        GCNRPTracker::LiveRegSet *RegionLiveIns =
+            &LiveIns[Stage->getRegionIdx()];
+
+        reinterpret_cast<GCNRPTracker *>(TheTracker)
+            ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+                    *RegionLiveIns);
+        reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)
+            ->reset(
+                Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+                RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
       }
 
       ScheduleDAGMILive::schedule();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8088339fbd26c2..e8c89b2f1baf27 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -128,7 +128,6 @@ class GCNSchedStrategy : public GenericScheduler {
   GCNDownwardRPTracker *getTracker() { return &TheTracker; }
 
   GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
-
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
@@ -364,7 +363,7 @@ class GCNSchedStage {
   bool isRegionWithExcessRP() const {
     return DAG.RegionsWithExcessRP[RegionIdx];
   }
-  
+
   // The region number this stage is currently working on
   unsigned getRegionIdx() { return RegionIdx; }
 

>From 15d90fb9a27444800fc23e0dc3972b68e784d97e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 27 May 2024 10:43:43 -0700
Subject: [PATCH 05/26] Actually use the iterative trackers

Change-Id: I198925f5ed91b0a49ac265e19fdbe2208139f09a
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a6115afe0f03ce..320acbaf5b22a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -172,7 +172,7 @@ static void getRegisterPressures(bool AtTop,
     if (AtTop) {
       GCNDownwardRPTracker TempTopTracker(TheTracker);
       auto MI = SU->getInstr();
-      TempTopTracker.advance(MI, true, DAG->getLIS());
+      TempTopTracker.advance(MI, false, DAG->getLIS());
 
       Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
           TempTopTracker.getPressure().getSGPRNum();
@@ -183,7 +183,7 @@ static void getRegisterPressures(bool AtTop,
     else {
       GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
       auto MI = SU->getInstr();
-      TempBotTracker.recede(*MI, true);
+      TempBotTracker.recede(*MI, false);
 
       Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
           TempBotTracker.getPressure().getSGPRNum();
@@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   if (GCNTrackers) {
     MachineInstr *MI = SU->getInstr();
-    IsTopNode ? (void)TheTracker.advance(MI, true, DAG->getLIS())
-              : TheUpwardTracker.recede(*MI, true);
+    IsTopNode ? (void)TheTracker.advance(MI, false, DAG->getLIS())
+              : TheUpwardTracker.recede(*MI, false);
   }
 
   return GenericScheduler::schedNode(SU, IsTopNode);

>From 9e3362c91e86ab23c7f3a94000a125cfca500032 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:24:09 -0700
Subject: [PATCH 06/26] Review Comments

Change-Id: Ifa69110bf0a239ea14d25c0bad03215d1b018656
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |  2 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 51 +++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  8 +--
 3 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 9b1db3241e4327..e89016b0ae984e 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
   LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
                        "target occupancy = "
                     << TgtOcc << '\n');
-  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler*/ true);
+  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true);
   unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
 
   for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 320acbaf5b22a6..e4d32b6eefb9b1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,8 +66,8 @@ static cl::opt<bool> GCNTrackers(
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
-    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS),
-      TheUpwardTracker(*C->LIS), HasHighPressure(false) {}
+    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS),
+      UpwardTracker(*C->LIS), HasHighPressure(false) {}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -157,8 +157,8 @@ static void getRegisterPressures(bool AtTop,
                                  const RegPressureTracker &RPTracker, SUnit *SU,
                                  std::vector<unsigned> &Pressure,
                                  std::vector<unsigned> &MaxPressure,
-                                 GCNDownwardRPTracker &TheTracker,
-                                 GCNUpwardRPTracker &TheUpwardTracker,
+                                 GCNDownwardRPTracker &DownwardTracker,
+                                 GCNUpwardRPTracker &UpwardTracker,
                                  ScheduleDAGMI *DAG) {
   // getDownwardPressure() and getUpwardPressure() make temporary changes to
   // the tracker, so we need to pass those function a non-const copy.
@@ -170,7 +170,7 @@ static void getRegisterPressures(bool AtTop,
       TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
   } else {
     if (AtTop) {
-      GCNDownwardRPTracker TempTopTracker(TheTracker);
+      GCNDownwardRPTracker TempTopTracker(DownwardTracker);
       auto MI = SU->getInstr();
       TempTopTracker.advance(MI, false, DAG->getLIS());
 
@@ -181,7 +181,7 @@ static void getRegisterPressures(bool AtTop,
     }
 
     else {
-      GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
+      GCNUpwardRPTracker TempBotTracker(UpwardTracker);
       auto MI = SU->getInstr();
       TempBotTracker.recede(*MI, false);
 
@@ -221,7 +221,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   // PressureDiffs.
   if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
-                         TheTracker, TheUpwardTracker, DAG);
+                         DownwardTracker, UpwardTracker, DAG);
   } else {
     // Reserve 4 slots.
     Pressure.resize(4, 0);
@@ -240,7 +240,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
 #ifdef EXPENSIVE_CHECKS
     std::vector<unsigned> CheckPressure, CheckMaxPressure;
     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
-                         TheTracker, TheUpwardTracker, DAG);
+                         TheTracker, UpwardTracker, DAG);
     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -330,13 +330,13 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   if (DAG->isTrackingPressure()) {
     SGPRPressure =
         GCNTrackers
-            ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum()
-                            : TheUpwardTracker.getPressure().getSGPRNum())
+            ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
+                            : UpwardTracker.getPressure().getSGPRNum())
             : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
     VGPRPressure =
         GCNTrackers
-            ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false)
-                            : TheUpwardTracker.getPressure().getVGPRNum(false))
+            ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
+                            : UpwardTracker.getPressure().getVGPRNum(false))
             : Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
   }
   ReadyQueue &Q = Zone.Available;
@@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   if (GCNTrackers) {
     MachineInstr *MI = SU->getInstr();
-    IsTopNode ? (void)TheTracker.advance(MI, false, DAG->getLIS())
-              : TheUpwardTracker.recede(*MI, false);
+    IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
+              : UpwardTracker.recede(*MI, false);
   }
 
   return GenericScheduler::schedNode(SU, IsTopNode);
@@ -836,18 +836,17 @@ void GCNScheduleDAGMILive::runSchedStages() {
       }
 
       if (GCNTrackers) {
-        GCNDownwardRPTracker *TheTracker = S.getTracker();
-        GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
-        GCNRPTracker::LiveRegSet *RegionLiveIns =
-            &LiveIns[Stage->getRegionIdx()];
-
-        reinterpret_cast<GCNRPTracker *>(TheTracker)
-            ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-                    *RegionLiveIns);
-        reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)
-            ->reset(
-                Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-                RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
+        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
+        GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+
+        reinterpret_cast<GCNRPTracker *>(DownwardTracker)->reset(
+            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+            *RegionLiveIns);
+        reinterpret_cast<GCNRPTracker *>(UpwardTracker)->reset(
+            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+            RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+
       }
 
       ScheduleDAGMILive::schedule();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e8c89b2f1baf27..91b4c0c63d2bb3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -71,10 +71,10 @@ class GCNSchedStrategy : public GenericScheduler {
   SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
 
   // GCN RP Tracker for top-down scheduling
-  mutable GCNDownwardRPTracker TheTracker;
+  mutable GCNDownwardRPTracker DownwardTracker;
 
   // GCN RP Tracker for botttom-up scheduling
-  mutable GCNUpwardRPTracker TheUpwardTracker;
+  mutable GCNUpwardRPTracker UpwardTracker;
 
 public:
   // schedule() have seen register pressure over the critical limits and had to
@@ -125,9 +125,9 @@ class GCNSchedStrategy : public GenericScheduler {
 
   GCNSchedStageID getNextStage() const;
 
-  GCNDownwardRPTracker *getTracker() { return &TheTracker; }
+  GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
 
-  GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
+  GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.

>From e583efa06e999342104df9e1a2fb4d9bb5f64641 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:29:41 -0700
Subject: [PATCH 07/26] Use DAG.MRI

Change-Id: I9f0275a0cede9e77dfd29262124f2a856f436c8c
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e4d32b6eefb9b1..c3bee344764160 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -840,13 +840,11 @@ void GCNScheduleDAGMILive::runSchedStages() {
         GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
         GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
 
-        reinterpret_cast<GCNRPTracker *>(DownwardTracker)->reset(
-            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-            *RegionLiveIns);
-        reinterpret_cast<GCNRPTracker *>(UpwardTracker)->reset(
-            Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
-            RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
-
+        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
+            ->reset(MRI, *RegionLiveIns);
+        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
+            ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
+                             Stage->getRegionIdx()));
       }
 
       ScheduleDAGMILive::schedule();

>From a8396a4b343dca0f37faafadf63ce32191b0d55f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:52:29 -0700
Subject: [PATCH 08/26] Formatting

Change-Id: I74c19a2cf20d2325178933f81e0e8716d7c62f17
---
 llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp |  2 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp      | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index e89016b0ae984e..da065e8d8cb6b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
   LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
                        "target occupancy = "
                     << TgtOcc << '\n');
-  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true);
+  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true);
   unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
 
   for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c3bee344764160..724ffa4494323c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,8 +66,9 @@ static cl::opt<bool> GCNTrackers(
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
-    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS),
-      UpwardTracker(*C->LIS), HasHighPressure(false) {}
+    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+      DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
+}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -329,10 +330,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   unsigned VGPRPressure = 0;
   if (DAG->isTrackingPressure()) {
     SGPRPressure =
-        GCNTrackers
-            ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
-                            : UpwardTracker.getPressure().getSGPRNum())
-            : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+        GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
+                                    : UpwardTracker.getPressure().getSGPRNum())
+                    : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
     VGPRPressure =
         GCNTrackers
             ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
@@ -838,7 +838,8 @@ void GCNScheduleDAGMILive::runSchedStages() {
       if (GCNTrackers) {
         GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
         GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
-        GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+        GCNRPTracker::LiveRegSet *RegionLiveIns =
+            &LiveIns[Stage->getRegionIdx()];
 
         reinterpret_cast<GCNRPTracker *>(DownwardTracker)
             ->reset(MRI, *RegionLiveIns);

>From 349cb7ea8dfea366b50edfe3ae2270fb38c0f8f0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 15:03:02 -0700
Subject: [PATCH 09/26] Review comments

Change-Id: I09f9ca74c07b516daed0e93a85937df8b9aa922b
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 724ffa4494323c..5006ea37e2564b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -329,15 +329,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
   if (DAG->isTrackingPressure()) {
-    SGPRPressure =
-        GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
-                                    : UpwardTracker.getPressure().getSGPRNum())
-                    : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
-    VGPRPressure =
-        GCNTrackers
-            ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
-                            : UpwardTracker.getPressure().getVGPRNum(false))
-            : Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    if (!GCNTrackers) {
+      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    } else {
+      GCNRPTracker *T = &UpwardTracker;
+      if (Zone.isTop())
+        T = &DownwardTracker;
+      SGPRPressure = T->getPressure().getSGPRNum();
+      VGPRPressure = T->getPressure().getVGPRNum(false);
+    }
   }
   ReadyQueue &Q = Zone.Available;
   for (SUnit *SU : Q) {

>From 01beddbec69dea1febf6c226d5dee86817bd5324 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 16:14:57 -0700
Subject: [PATCH 10/26] Allocate Pressure vector

Change-Id: I5effce973fa2d945076e89b4453a844f0fc85fc9
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 5006ea37e2564b..cdafa01eeb857a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -170,6 +170,7 @@ static void getRegisterPressures(bool AtTop,
     else
       TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
   } else {
+    Pressure.resize(4, 0);
     if (AtTop) {
       GCNDownwardRPTracker TempTopTracker(DownwardTracker);
       auto MI = SU->getInstr();

>From 4cad2b9d99ae64b4ad8b752ed14cd37a05be8a24 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 18 Jun 2024 11:39:48 -0700
Subject: [PATCH 11/26] Remove flag from upward RPTracker

Change-Id: I6217c03f56d34f584e5b23cf7c4462842bc7173b
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index cdafa01eeb857a..0c7639462905d7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -185,7 +185,7 @@ static void getRegisterPressures(bool AtTop,
     else {
       GCNUpwardRPTracker TempBotTracker(UpwardTracker);
       auto MI = SU->getInstr();
-      TempBotTracker.recede(*MI, false);
+      TempBotTracker.recede(*MI);
 
       Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
           TempBotTracker.getPressure().getSGPRNum();
@@ -492,7 +492,7 @@ void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   if (GCNTrackers) {
     MachineInstr *MI = SU->getInstr();
     IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
-              : UpwardTracker.recede(*MI, false);
+              : UpwardTracker.recede(*MI);
   }
 
   return GenericScheduler::schedNode(SU, IsTopNode);

>From cb5c92603ea2967ddcb2dcc936029f02da237e97 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 19 Jun 2024 11:45:32 -0700
Subject: [PATCH 12/26] Review comments

Change-Id: Ibeaba6cab034636472b20c36adfadabbbc2c19ef
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 53 ++++++++++-----------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0c7639462905d7..c5d217d80a7c8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -165,33 +165,30 @@ static void getRegisterPressures(bool AtTop,
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
   if (!GCNTrackers) {
-    if (AtTop)
-      TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
-    else
-      TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
-  } else {
-    Pressure.resize(4, 0);
-    if (AtTop) {
-      GCNDownwardRPTracker TempTopTracker(DownwardTracker);
-      auto MI = SU->getInstr();
-      TempTopTracker.advance(MI, false, DAG->getLIS());
-
-      Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
-          TempTopTracker.getPressure().getSGPRNum();
-      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-          TempTopTracker.getPressure().getVGPRNum(false);
-    }
+    AtTop
+        ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
+        : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
 
-    else {
-      GCNUpwardRPTracker TempBotTracker(UpwardTracker);
-      auto MI = SU->getInstr();
-      TempBotTracker.recede(*MI);
+    return;
+  }
 
-      Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
-          TempBotTracker.getPressure().getSGPRNum();
-      Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-          TempBotTracker.getPressure().getVGPRNum(false);
-    }
+  // GCNTrackers
+  Pressure.resize(4, 0);
+  MachineInstr *MI = SU->getInstr();
+  if (AtTop) {
+    GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
+    TempDownwardTracker.advance(MI, false, DAG->getLIS());
+    Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+        TempDownwardTracker.getPressure().getSGPRNum();
+    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+        TempDownwardTracker.getPressure().getVGPRNum(false);
+  } else {
+    GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
+    TempUpwardTracker.recede(*MI);
+    Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+        TempUpwardTracker.getPressure().getSGPRNum();
+    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+        TempUpwardTracker.getPressure().getVGPRNum(false);
   }
 }
 
@@ -334,9 +331,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
       VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
     } else {
-      GCNRPTracker *T = &UpwardTracker;
-      if (Zone.isTop())
-        T = &DownwardTracker;
+      GCNRPTracker *T = Zone.isTop()
+                            ? static_cast<GCNRPTracker *>(&UpwardTracker)
+                            : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
       VGPRPressure = T->getPressure().getVGPRNum(false);
     }

>From 80534d30e23b60d16e87e19ee8b9276dd8a0a88c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 20 Jun 2024 08:49:26 -0700
Subject: [PATCH 13/26] Dont modify existing PreRARematStage LiveIn handling

Change-Id: I96c99f12c59ef0eea86f7fbf134913ecc47dd6f2
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c5d217d80a7c8a..d48e33f7df950a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1669,6 +1669,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     MachineInstr *MI = Entry.first;
     MachineInstr *OldMI = Entry.second;
 
+    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+    DAG.BBLiveInMap.erase(OldMI);
+
     // Remove OldMI and update LIS
     Register Reg = MI->getOperand(0).getReg();
     LIS->RemoveMachineInstrFromMaps(*OldMI);
@@ -1686,8 +1689,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DAG.Regions = NewRegions;
   DAG.RescheduleRegions = NewRescheduleRegions;
 
-  DAG.BBLiveInMap = DAG.getBBLiveInMap();
-
   if (GCNTrackers)
     DAG.RegionLiveOuts.buildLiveRegMap();
 

>From 28b520d4a3c18e58b865586571373c9fbe7cf687 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 12 Aug 2024 13:55:44 -0700
Subject: [PATCH 14/26] Use GCNTracker RP speculation

Change-Id: I3e893ca2ffcf1032fe157b537c9563565215b123
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d48e33f7df950a..7ce8d8c56baf56 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -177,18 +177,18 @@ static void getRegisterPressures(bool AtTop,
   MachineInstr *MI = SU->getInstr();
   if (AtTop) {
     GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
-    TempDownwardTracker.advance(MI, false, DAG->getLIS());
+    TempDownwardTracker.bumpDownwardPressure(MI);
     Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
         TempDownwardTracker.getPressure().getSGPRNum();
     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-        TempDownwardTracker.getPressure().getVGPRNum(false);
+        TempDownwardTracker.getPressure().getArchVGPRNum();
   } else {
     GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
-    TempUpwardTracker.recede(*MI);
+    TempUpwardTracker.bumpUpwardPressure(MI);
     Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
         TempUpwardTracker.getPressure().getSGPRNum();
     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-        TempUpwardTracker.getPressure().getVGPRNum(false);
+        TempUpwardTracker.getPressure().getArchVGPRNum();
   }
 }
 

>From de185daec81d9ca82bb84bbc09ac68244ebad139 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 20 Aug 2024 12:29:33 -0700
Subject: [PATCH 15/26] Port changes from pull/93088

Change-Id: I2de464b32d3c6ed9a77cbbc669d735dde63c2e47
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 45 +++++++++++++----------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index d1a50adc1918cf..cbcef5faf21ed9 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -299,11 +299,11 @@ static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
   return I->LaneMask;
 }
 
-static LaneBitmask
-getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
-                     bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
-                     LaneBitmask SafeDefault,
-                     bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
+static LaneBitmask getLanesWithProperty(
+    const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+    bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+    LaneBitmask SafeDefault,
+    function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
   if (RegUnit.isVirtual()) {
     const LiveInterval &LI = LIS.getInterval(RegUnit);
     LaneBitmask Result;
@@ -318,14 +318,14 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
     }
 
     return Result;
-  } else {
-    const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
-    // Be prepared for missing liveranges: We usually do not compute liveranges
-    // for physical registers on targets with many registers (GPUs).
-    if (LR == nullptr)
-      return SafeDefault;
-    return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
   }
+
+  const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+  // Be prepared for missing liveranges: We usually do not compute liveranges
+  // for physical registers on targets with many registers (GPUs).
+  if (LR == nullptr)
+    return SafeDefault;
+  return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
 }
 
 /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
@@ -334,19 +334,21 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
 static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
                                   SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
                                   const MachineRegisterInfo &MRI,
+                                  const SIRegisterInfo *TRI,
                                   const LiveIntervals *LIS,
                                   bool Upward = false) {
-  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
-  for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+  for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
     if (MO.isUndef())
       continue;
+    if (!MO.readsReg())
+      continue;
     const MachineInstr *MI = MO.getParent();
     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
     bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
                           : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
     if (InRange) {
       unsigned SubRegIdx = MO.getSubReg();
-      LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+      LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
       LastUseMask &= ~UseMask;
       if (LastUseMask.none())
         return LaneBitmask::getNone();
@@ -518,7 +520,9 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
 
   // Account for register pressure similar to RegPressureTracker::recede().
   RegisterOperands RegOpers;
-  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+  const SIRegisterInfo *TRI =
+      MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
   assert(RegOpers.DeadDefs.empty());
   RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
@@ -559,8 +563,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
         LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
                       : LIS.getMBBEndIdx(MI->getParent());
     ;
-    LaneBitmask LastUseMask =
-        findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true);
+    LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx,
+                                             *MRI, TRI, &LIS, true);
     LastUseMask &= ~LiveAfter;
     LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
     CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
@@ -734,7 +738,8 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
 
   // Account for register pressure similar to RegPressureTracker::recede().
   RegisterOperands RegOpers;
-  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  const SIRegisterInfo *TRI =
+      MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
   RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
 
@@ -761,7 +766,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
     }
 
     LastUseMask =
-        findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS);
+        findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS);
     if (LastUseMask.none())
       continue;
 

>From ad2e468853ad93265a1a7206469472223f6ac854 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Aug 2024 15:16:05 -0700
Subject: [PATCH 16/26] Port changes from pull/93088

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 16 +++++-----------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h   |  4 ++--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index cbcef5faf21ed9..58d34546b9e748 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -321,8 +321,6 @@ static LaneBitmask getLanesWithProperty(
   }
 
   const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
-  // Be prepared for missing liveranges: We usually do not compute liveranges
-  // for physical registers on targets with many registers (GPUs).
   if (LR == nullptr)
     return SafeDefault;
   return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
@@ -337,11 +335,9 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
                                   const SIRegisterInfo *TRI,
                                   const LiveIntervals *LIS,
                                   bool Upward = false) {
-  for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
+  for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
     if (MO.isUndef())
       continue;
-    if (!MO.readsReg())
-      continue;
     const MachineInstr *MI = MO.getParent();
     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
     bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
@@ -513,7 +509,8 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(CurPressure == getRegPressure(*MRI, LiveRegs));
 }
 
-void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
+void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
+                                            const SIRegisterInfo *TRI) {
   assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
 
   SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -521,8 +518,6 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
   // Account for register pressure similar to RegPressureTracker::recede().
   RegisterOperands RegOpers;
 
-  const SIRegisterInfo *TRI =
-      MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
   assert(RegOpers.DeadDefs.empty());
   RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
@@ -730,7 +725,8 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
   });
 }
 
-void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
+void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+                                                const SIRegisterInfo *TRI) {
   assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
 
   SlotIndex SlotIdx;
@@ -738,8 +734,6 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
 
   // Account for register pressure similar to RegPressureTracker::recede().
   RegisterOperands RegOpers;
-  const SIRegisterInfo *TRI =
-      MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
   RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index f78e4d7da0a1dd..5f9434f91efc64 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -222,7 +222,7 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   /// to false allows for an externally managed iterator / program order.
   void recede(const MachineInstr &MI);
 
-  void bumpUpwardPressure(const MachineInstr *MI);
+  void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
 
   /// \p returns whether the tracker's state after receding MI corresponds
   /// to reported by LIS.
@@ -306,7 +306,7 @@ class GCNDownwardRPTracker : public GCNRPTracker {
                MachineBasicBlock::const_iterator End,
                const LiveRegSet *LiveRegsCopy = nullptr);
 
-  void bumpDownwardPressure(const MachineInstr *MI);
+  void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
 };
 
 LaneBitmask getLiveLaneMask(unsigned Reg,

>From 0ec89ac36cd8c36054e7a2edbec0c4c76c8f78ef Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Aug 2024 15:34:33 -0700
Subject: [PATCH 17/26] Feed SIRegisterInfo to Trackers + Propagate unused AGPR
 speculative pressure + Use correct previous VGPR pressure

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 +++++++++++----------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7ce8d8c56baf56..bf812e840b876c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -154,13 +154,11 @@ static bool canUsePressureDiffs(const SUnit &SU) {
   return true;
 }
 
-static void getRegisterPressures(bool AtTop,
-                                 const RegPressureTracker &RPTracker, SUnit *SU,
-                                 std::vector<unsigned> &Pressure,
-                                 std::vector<unsigned> &MaxPressure,
-                                 GCNDownwardRPTracker &DownwardTracker,
-                                 GCNUpwardRPTracker &UpwardTracker,
-                                 ScheduleDAGMI *DAG) {
+static void getRegisterPressures(
+    bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
+    std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
+    GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
+    ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
   // getDownwardPressure() and getUpwardPressure() make temporary changes to
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
@@ -177,18 +175,22 @@ static void getRegisterPressures(bool AtTop,
   MachineInstr *MI = SU->getInstr();
   if (AtTop) {
     GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
-    TempDownwardTracker.bumpDownwardPressure(MI);
+    TempDownwardTracker.bumpDownwardPressure(MI, SRI);
     Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
         TempDownwardTracker.getPressure().getSGPRNum();
     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
         TempDownwardTracker.getPressure().getArchVGPRNum();
+    Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+        TempDownwardTracker.getPressure().getAGPRNum();
   } else {
     GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
-    TempUpwardTracker.bumpUpwardPressure(MI);
+    TempUpwardTracker.bumpUpwardPressure(MI, SRI);
     Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
         TempUpwardTracker.getPressure().getSGPRNum();
     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
         TempUpwardTracker.getPressure().getArchVGPRNum();
+    Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+        TempDownwardTracker.getPressure().getAGPRNum();
   }
 }
 
@@ -220,7 +222,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   // PressureDiffs.
   if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
-                         DownwardTracker, UpwardTracker, DAG);
+                         DownwardTracker, UpwardTracker, DAG, SRI);
   } else {
     // Reserve 4 slots.
     Pressure.resize(4, 0);
@@ -239,7 +241,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
 #ifdef EXPENSIVE_CHECKS
     std::vector<unsigned> CheckPressure, CheckMaxPressure;
     getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
-                         TheTracker, UpwardTracker, DAG);
+                         TheTracker, UpwardTracker, DAG, SRI);
     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -335,7 +337,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
-      VGPRPressure = T->getPressure().getVGPRNum(false);
+      VGPRPressure = T->getPressure().getArchVGPRNum();
     }
   }
   ReadyQueue &Q = Zone.Available;

>From adcd2c741516bd4ea7b13dc8e5b58e73e0cdb2cc Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 5 Sep 2024 08:24:43 -0700
Subject: [PATCH 18/26] Review comments

Change-Id: I286c9ed1ae91a68da881c6fa27f5f391102d0a9c
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp   | 68 +++++++++++++--------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     | 11 ++++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp |  2 +-
 3 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 58d34546b9e748..c47d0c0d613dd5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -289,6 +289,7 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
   }
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
                                Register RegUnit) {
   auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
@@ -299,6 +300,7 @@ static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
   return I->LaneMask;
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 static LaneBitmask getLanesWithProperty(
     const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
     bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
@@ -326,6 +328,7 @@ static LaneBitmask getLanesWithProperty(
   return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
 /// The query starts with a lane bitmask which gets lanes/bits removed for every
 /// use we find.
@@ -353,6 +356,35 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
   return LastUseMask;
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
+                                  const MachineRegisterInfo &MRI,
+                                  bool TrackLaneMasks, Register RegUnit,
+                                  SlotIndex Pos) {
+  return getLanesWithProperty(
+      LIS, MRI, TrackLaneMasks, RegUnit, Pos, LaneBitmask::getAll(),
+      [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); });
+}
+
+// Copy/paste from RegisterPressure.cpp (RegisterOperands::adjustLaneLiveness)
+static void adjustDefLaneLiveness(SmallVectorImpl<RegisterMaskPair> &Defs,
+                                  SlotIndex &Pos, const LiveIntervals &LIS,
+                                  const MachineRegisterInfo &MRI) {
+  for (auto *I = Defs.begin(); I != Defs.end();) {
+    LaneBitmask LiveAfter =
+        getLiveLanesAt(LIS, MRI, true, I->RegUnit, Pos.getDeadSlot());
+    // If the def is all that is live after the instruction, then in case
+    // of a subregister def we need a read-undef flag.
+    LaneBitmask ActualDef = I->LaneMask & LiveAfter;
+    if (ActualDef.none()) {
+      I = Defs.erase(I);
+    } else {
+      I->LaneMask = ActualDef;
+      ++I;
+    }
+  }
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // GCNRPTracker
 
@@ -417,6 +449,7 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
 }
 
 void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+  GCNRegPressure TempPressure = CurPressure;
   for (const RegisterMaskPair &P : DeadDefs) {
     Register Reg = P.RegUnit;
     if (!Reg.isVirtual())
@@ -426,16 +459,9 @@ void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
     CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
   }
   MaxPressure = max(MaxPressure, CurPressure);
-  for (const RegisterMaskPair &P : DeadDefs) {
-    Register Reg = P.RegUnit;
-    if (!Reg.isVirtual())
-      continue;
-    LaneBitmask LiveMask = LiveRegs[Reg];
-    LaneBitmask BumpedMask = LiveMask | P.LaneMask;
-    CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI);
-  }
+  CurPressure = TempPressure;
 }
-
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
                                            SlotIndex Pos) const {
   return getLanesWithProperty(
@@ -520,7 +546,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
 
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
   assert(RegOpers.DeadDefs.empty());
-  RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+  adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI);
   RegOpers.detectDeadDefs(*MI, LIS);
 
   // Boost max pressure for all dead defs together.
@@ -537,11 +563,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
     LaneBitmask DefLanes = P.LaneMask;
     LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
 
-    // There may be parts of the register that were dead before the
-    // instruction, but became live afterwards. Similarly, some parts
-    // may have been killed in this instruction.
     CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
-    CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI);
     MaxPressure = max(MaxPressure, CurPressure);
   }
   // Generate liveness for uses.
@@ -549,19 +571,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
     Register Reg = P.RegUnit;
     if (!Reg.isVirtual())
       continue;
-    // If this register was also in a def operand, we've handled it
-    // with defs.
-    if (getRegLanes(RegOpers.Defs, Reg).any())
-      continue;
     LaneBitmask LiveAfter = LiveRegs[Reg];
-    SlotIndex CurrIdx =
-        LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
-                      : LIS.getMBBEndIdx(MI->getParent());
-    ;
-    LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx,
-                                             *MRI, TRI, &LIS, true);
-    LastUseMask &= ~LiveAfter;
-    LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
+    LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
     CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
   }
   MaxPressure = max(MaxPressure, CurPressure);
@@ -682,8 +693,13 @@ bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
                                    LiveIntervals *TheLIS) {
   if (UseInternalIterator && NextMI == MBBEnd)
     return false;
+
   advanceBeforeNext(MI, UseInternalIterator, TheLIS);
   advanceToNext(MI, UseInternalIterator);
+  if (!UseInternalIterator) {
+    // We must remove any dead def lanes from the current RP
+    advanceBeforeNext(MI, true, TheLIS);
+  }
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 5f9434f91efc64..463da472bb69ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -169,6 +169,7 @@ class GCNRPTracker {
   void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
              bool After);
 
+  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
   void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
 
   LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
@@ -222,6 +223,11 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   /// to false allows for an externally managed iterator / program order.
   void recede(const MachineInstr &MI);
 
+  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+  /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This
+  /// does not rely on the implicit program ordering in the LiveIntervals to
+  /// support RP Speculation. It leaves the state of pressure inconsistent with
+  /// the current position
   void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
 
   /// \p returns whether the tracker's state after receding MI corresponds
@@ -306,6 +312,11 @@ class GCNDownwardRPTracker : public GCNRPTracker {
                MachineBasicBlock::const_iterator End,
                const LiveRegSet *LiveRegsCopy = nullptr);
 
+  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+  /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This
+  /// does not rely on the implicit program ordering in the LiveIntervals to
+  /// support RP Speculation. It leaves the state of pressure inconsistent with
+  /// the current position
   void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
 };
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bf812e840b876c..651f25c80d60c7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,7 +190,7 @@ static void getRegisterPressures(
     Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
         TempUpwardTracker.getPressure().getArchVGPRNum();
     Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
-        TempDownwardTracker.getPressure().getAGPRNum();
+        TempUpwardTracker.getPressure().getAGPRNum();
   }
 }
 

>From 7e35229dc8080b5735f5b63513cffc0183676ff3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 18 Sep 2024 12:59:36 -0700
Subject: [PATCH 19/26] Avoid const_cast

Change-Id: Ib7b21b2ab4cc44abc61fb8ad8880fb78f831619a
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index c47d0c0d613dd5..fb92924363d43b 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -599,15 +599,15 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
                                              LiveIntervals *TheLIS) {
   assert(MRI && "call reset first");
   SlotIndex SI;
-  LiveIntervals *CurrLIS;
-  MachineInstr *CurrMI;
+  const LiveIntervals *CurrLIS;
+  const MachineInstr *CurrMI;
   if (UseInternalIterator) {
     if (!LastTrackedMI)
       return NextMI == MBBEnd;
 
     assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
-    CurrLIS = const_cast<LiveIntervals *>(&LIS);
-    CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+    CurrLIS = &LIS;
+    CurrMI = LastTrackedMI;
 
     SI = NextMI == MBBEnd
              ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
@@ -673,7 +673,7 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
     LastTrackedMI = MI;
   }
 
-  MachineInstr *CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+  const MachineInstr *CurrMI = LastTrackedMI;
 
   // Add new registers or mask bits.
   for (const auto &MO : CurrMI->all_defs()) {

>From af9f2200e025dcd1d2d54e6bbc96d3064409a516 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Sep 2024 10:08:21 -0700
Subject: [PATCH 20/26] Fix shouldTrackVGPRs calculation

Change-Id: I3d0aae74f20927722cd6844b1d586ae7accab86e
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 651f25c80d60c7..28ca41d2dc96ed 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -333,7 +333,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
       VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
     } else {
-      GCNRPTracker *T = Zone.isTop()
+      GCNRPTracker *T = IsBottomUp
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();

>From 94c8ba8c4163aabe7bef96e98722e84a3ca4d66c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 27 Sep 2024 12:40:02 -0700
Subject: [PATCH 21/26] Add lit tests

Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde
---
 .../CodeGen/AMDGPU/high-RP-reschedule.mir     |  10 +-
 llvm/test/CodeGen/AMDGPU/pr51516.mir          |   6 +-
 .../schedule-amdgpu-tracker-physreg-crash.ll  |  65 ++
 .../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++
 .../AMDGPU/schedule-amdgpu-trackers.ll        | 647 ++++++++++++++++++
 ...schedule-regpressure-ilp-metric-spills.mir |  15 +
 .../AMDGPU/schedule-relaxed-occupancy.ll      |  10 +-
 7 files changed, 1240 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
index e9005e94ce5db7..d57450baea911a 100644
--- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
+++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
@@ -1,11 +1,17 @@
 # REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 --- |
   define amdgpu_kernel void @high-RP-reschedule() { ret void }
 ...
 
-# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4.
+# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage.
 
 ---
 name: high-RP-reschedule
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 4be102f7860eab..49dd5c6c39ff5c 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 # Check that %3 was not rematerialized before the last store since its operand %1
 # is killed by that store.
@@ -7,6 +8,9 @@
 # GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
 # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
 
+# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64
+# GCN-GCNTRACKER-NOT: SI_SPILL
+
 ---
 name:            global_sextload_v32i32_to_v32i64
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
new file mode 100644
index 00000000000000..79187f51af0d2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
@@ -0,0 +1,65 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1  2>&1  < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1  < %s | FileCheck -check-prefixes=GCN %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <7 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <5 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+                     <16 x i32>, <6 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+; ERR-GCNTRACKERS: ran out of registers during register allocation
+; GCN-NOT: ran out of registers during register allocation
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+  %asm = call %asm.output asm sideeffect
+    "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+    "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+  %s0 = extractvalue %asm.output %asm, 0
+  %s1 = extractvalue %asm.output %asm, 1
+  %s2 = extractvalue %asm.output %asm, 2
+  %s3 = extractvalue %asm.output %asm, 3
+  %s4 = extractvalue %asm.output %asm, 4
+  %s5 = extractvalue %asm.output %asm, 5
+
+  %v0 = extractvalue %asm.output %asm, 6
+  %v1 = extractvalue %asm.output %asm, 7
+
+  %vcc = extractvalue %asm.output %asm, 8
+
+  ; scc is unavailable since it is live in
+  call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+                           "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+    <16 x i32> %s0,
+    <16 x i32> %s1,
+    <16 x i32> %s2,
+    <8 x i32> %s3,
+    <2 x i32> %s4,
+    i32 %s5,
+    <16 x i32> %v0,
+    <7 x i32> %v1,
+    i64 %vcc,
+    ptr addrspace(5) %alloca1,
+    i32 0) ; use of scc
+
+  ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
new file mode 100644
index 00000000000000..c490c76f4531de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0  < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1  < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s
+
+; GCN-LABEL: {{^}}spill:
+; GCN:    codeLenInByte = 1000
+; GCN-GCNTRACKERS:    codeLenInByte = 1016
+; GCN:    NumSgprs: 104
+; GCN-GCNTRACKERS:    NumSgprs: 104
+; GCN:    NumVgprs: 1
+; GCN-GCNTRACKERS:    NumVgprs: 2
+; GCN:    ScratchSize: 0
+; GCN-GCNTRACKERS:    ScratchSize: 0
+; GCN:    Occupancy: 5
+; GCN-GCNTRACKERS:    Occupancy: 5
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
+entry:
+  %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+  %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+  %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+  %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+  %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+  %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+  %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+  %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+  %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+  %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+  %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+  %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+  %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+  %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+  %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+  %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+  %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+  %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+  %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+  %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+  %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+  %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+  %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+  %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+  %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+  %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+  %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+  %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+  %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+  %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+  %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+  %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+  %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+  %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+  %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+  %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+  %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+  %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+  %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+  %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+  %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+  %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+  %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+  %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+  %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+  %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+  %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+  %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+  %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+  %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+  %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+  %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+  %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+  %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+  %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+  %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+  %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+  %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+  %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+  %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+  %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+  %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+  %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+  %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+  %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+  %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+  %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+  %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+  %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+  %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+  %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+  %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+  %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+  %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+  %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+  %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+  %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+  %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+  %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+  %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+  %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+  %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+  %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+  %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+  %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+  %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+  %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+  %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+  %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+  %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+  %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+  %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+  %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+  %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+  %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+  %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+  %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+  %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+  %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+  %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+  %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+  %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+  %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+  %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+  ; 64 byte asm
+  call void asm sideeffect
+   "v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64",""() #0
+  br label %bb3
+
+bb3:
+  tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+  tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+  tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+  tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+  tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+  tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+  tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+  tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+  tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+  tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+  tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+  tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+  tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+  tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+  tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+  tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+  tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+  tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+  tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+  tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+  tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+  tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+  tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+  tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+  tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+  tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+  tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+  tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+  tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+  tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+  tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+  tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+  tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+  tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+  tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+  tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+  tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+  tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+  tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+  tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+  tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+  tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+  tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+  tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+  tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+  tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+  tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+  tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+  tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+  tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+  tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+  tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+  tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+  tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+  tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+  tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+  tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+  tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+  tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+  tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+  tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+  tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+  tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+  tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+  tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+  tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+  tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+  tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+  tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+  tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+  tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+  tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+  tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+  tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+  tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+  tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+  tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+  tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+  tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+  tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+  tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+  tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+  tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+  tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+  tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+  tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+  tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+  tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+  tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+  tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+  tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+  tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+  tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+  tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+  tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+  tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+  tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+  tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+  tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+  tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+  tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+  tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}spill_func:
+; GCN:    codeLenInByte = 1612
+; GCN-GCNTRACKERS:    codeLenInByte = 1660
+; GCN:    NumSgprs: 104
+; GCN-GCNTRACKERS:    NumSgprs: 104
+; GCN:    NumVgprs: 3
+; GCN-GCNTRACKERS:    NumVgprs: 4
+; GCN:    ScratchSize: 12
+; GCN-GCNTRACKERS:    ScratchSize: 16
+
+define void @spill_func(ptr addrspace(1) %arg) #0 {
+entry:
+  %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+  %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+  %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+  %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+  %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+  %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+  %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+  %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+  %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+  %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+  %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+  %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+  %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+  %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+  %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+  %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+  %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+  %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+  %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+  %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+  %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+  %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+  %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+  %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+  %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+  %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+  %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+  %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+  %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+  %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+  %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+  %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+  %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+  %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+  %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+  %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+  %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+  %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+  %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+  %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+  %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+  %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+  %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+  %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+  %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+  %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+  %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+  %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+  %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+  %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+  %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+  %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+  %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+  %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+  %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+  %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+  %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+  %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+  %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+  %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+  %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+  %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+  %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+  %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+  %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+  %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+  %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+  %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+  %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+  %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+  %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+  %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+  %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+  %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+  %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+  %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+  %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+  %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+  %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+  %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+  %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+  %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+  %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+  %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+  %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+  %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+  %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+  %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+  %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+  %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+  %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+  %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+  %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+  %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+  %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+  %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+  %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+  %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+  %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+  %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+  %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+  %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+  %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+  %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+  ; 64 byte asm
+  call void asm sideeffect
+   "v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64",""() #0
+  br label %bb3
+
+bb3:
+  tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+  tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+  tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+  tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+  tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+  tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+  tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+  tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+  tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+  tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+  tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+  tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+  tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+  tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+  tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+  tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+  tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+  tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+  tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+  tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+  tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+  tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+  tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+  tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+  tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+  tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+  tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+  tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+  tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+  tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+  tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+  tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+  tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+  tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+  tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+  tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+  tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+  tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+  tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+  tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+  tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+  tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+  tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+  tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+  tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+  tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+  tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+  tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+  tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+  tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+  tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+  tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+  tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+  tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+  tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+  tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+  tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+  tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+  tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+  tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+  tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+  tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+  tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+  tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+  tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+  tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+  tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+  tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+  tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+  tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+  tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+  tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+  tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+  tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+  tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+  tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+  tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+  tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+  tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+  tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+  tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+  tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+  tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+  tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+  tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+  tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+  tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+  tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+  tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+  tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+  tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+  tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+  tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+  tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+  tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+  tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+  tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+  tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+  tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+  tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+  tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+  tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
new file mode 100644
index 00000000000000..53f533ebb28427
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -0,0 +1,647 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s
+
+; GCN trackers are sensitive to minor changes in register pressure (RP) and may decline to schedule certain
+; instructions which, had they been scheduled, would have enabled other instructions that reduce RP.
+
+; CHECK-LABEL: {{^}}return_72xi32:
+; GFX11-PAL:    codeLenInByte = 768
+; GFX11-PAL-GCNTRACKERS:    codeLenInByte = 888
+; GFX11-PAL:    NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
+; GFX11-PAL:    NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
+; GFX11-PAL:    ScratchSize: 220
+; GFX11-PAL-GCNTRACKERS:    ScratchSize: 248
+
+
+; CHECK-LABEL: {{^}}call_72xi32:
+; GFX11-PAL:    codeLenInByte = 1300
+; GFX11-PAL-GCNTRACKERS:    codeLenInByte = 1372
+; GFX11-PAL:    NumSgprs: 35
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 35
+; GFX11-PAL:    NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
+; GFX11-PAL:    ScratchSize: 2780
+; GFX11-PAL-GCNTRACKERS:    ScratchSize: 2808
+
+
+define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; identity function: forces a full 72-dword vector through the gfx call/return convention
+  ret <72 x i32> %val ; returned unchanged; only the ABI traffic (VGPRs + stack spill) is under test
+}
+
+define amdgpu_gfx void @call_72xi32() #1 { ; stresses argument marshalling for two calls carrying a 72 x i32 vector
+entry:
+  %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer) ; first call: all-zero argument
+  %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0 ; modify element 0 of the returned vector
+  %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58 ; modify a high element (58) so the whole vector must be rebuilt
+  %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1) ; second call reuses the perturbed vector; result intentionally unused
+  ret void
+}
+
+; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64:
+; TONGA:     codeLenInByte = 420
+; TONGA-GCNTRACKERS:     codeLenInByte = 436
+; TONGA:    NumSgprs: 96
+; TONGA-GCNTRACKERS:    NumSgprs: 96
+; TONGA:    NumVgprs: 33
+; TONGA-GCNTRACKERS:    NumVgprs: 25
+; TONGA:    Occupancy: 7
+; TONGA-GCNTRACKERS:    Occupancy: 8
+
+
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in)  { ; load 16 x f16, widen to f64, store 128 bytes back
+  %val = load <16 x half>, ptr addrspace(1) %in ; single 32-byte global load
+  %cvt = fpext <16 x half> %val to <16 x double> ; 4x widening conversion drives VGPR pressure (see NumVgprs checks above)
+  store <16 x double> %cvt, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GENERIC:     codeLenInByte = 860
+; GENERIC-GCNTRACKERS:     codeLenInByte = 860
+; GENERIC:    NumSgprs: 71
+; GENERIC-GCNTRACKERS:    NumSgprs: 54
+; GENERIC:    NumVgprs: 16
+; GENERIC-GCNTRACKERS:    NumVgprs: 16
+; GENERIC:    Occupancy: 7
+; GENERIC-GCNTRACKERS:    Occupancy: 8
+
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; zext-widen 64 x i16 from constant memory, store 256 bytes to global
+  %load = load <64 x i16>, ptr addrspace(4) %in ; addrspace(4): constant memory, lowered via scalar loads (see NumSgprs checks above)
+  %ext = zext <64 x i16> %load to <64 x i32> ; zero-extend doubles the live data size between load and store
+  store <64 x i32> %ext, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
+; GFX908:     codeLenInByte = 1436
+; GFX908-GCNTRACKERS:     codeLenInByte = 1436
+; GFX908:    NumSgprs: 56
+; GFX908-GCNTRACKERS:    NumSgprs: 56
+; GFX908:    NumVgprs: 43
+; GFX908-GCNTRACKERS:    NumVgprs: 39
+; GFX908:    Occupancy: 5
+; GFX908-GCNTRACKERS:    Occupancy: 6
+
+
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
+entry:
+  %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i2 = load i64, ptr addrspace(4) %i, align 8
+  %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i4 = shl i32 %i3, 8
+  %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
+  %i6 = add i32 %i4, %i5
+  %i7 = trunc i64 %i2 to i32
+  %conv = add i32 %i6, %i7
+  %conv.frozen = freeze i32 %conv
+  %div = udiv i32 %conv.frozen, 49
+  %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef
+  %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5
+  br label %for.cond28.preheader
+
+for.cond28.preheader:                             ; preds = %for.cond28.preheader, %entry
+  %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ]
+  %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ]
+  %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ]
+  %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ]
+  %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ]
+  %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ]
+  %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ]
+  %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ]
+  %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ]
+  %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ]
+  %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ]
+  %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ]
+  %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ]
+  %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ]
+  %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ]
+  %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ]
+  %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ]
+  %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ]
+  %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ]
+  %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ]
+  %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ]
+  %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ]
+  %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ]
+  %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ]
+  %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ]
+  %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ]
+  %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ]
+  %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ]
+  %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ]
+  %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
+  %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
+  %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
+  %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
+  %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
+  %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
+  %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4
+  %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49
+  %i9 = load float, ptr addrspace(1) %add.ptr47, align 4
+  %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98
+  %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4
+  %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147
+  %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4
+  %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4
+  %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024
+  %i13 = load float, ptr addrspace(4) %add.ptr66, align 4
+  %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048
+  %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4
+  %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072
+  %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4
+  %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1
+  %i16 = load float, ptr addrspace(4) %add.ptr70, align 4
+  %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025
+  %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4
+  %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049
+  %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4
+  %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073
+  %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4
+  %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2
+  %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4
+  %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026
+  %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4
+  %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050
+  %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4
+  %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074
+  %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4
+  %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3
+  %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4
+  %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027
+  %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4
+  %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051
+  %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4
+  %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075
+  %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4
+  %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4
+  %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4
+  %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028
+  %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4
+  %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052
+  %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4
+  %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076
+  %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4
+  %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5
+  %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4
+  %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029
+  %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4
+  %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053
+  %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4
+  %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077
+  %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4
+  %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6
+  %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4
+  %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030
+  %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4
+  %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054
+  %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4
+  %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078
+  %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4
+  %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7
+  %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4
+  %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031
+  %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4
+  %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055
+  %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4
+  %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079
+  %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4
+  %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8
+  %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4
+  %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032
+  %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4
+  %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056
+  %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4
+  %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080
+  %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4
+  %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9
+  %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4
+  %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033
+  %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4
+  %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057
+  %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4
+  %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081
+  %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4
+  %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10
+  %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4
+  %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034
+  %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4
+  %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058
+  %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4
+  %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082
+  %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4
+  %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11
+  %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4
+  %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035
+  %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4
+  %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059
+  %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4
+  %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083
+  %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4
+  %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12
+  %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4
+  %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036
+  %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4
+  %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060
+  %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4
+  %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084
+  %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4
+  %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13
+  %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4
+  %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037
+  %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4
+  %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061
+  %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4
+  %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085
+  %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4
+  %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14
+  %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4
+  %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038
+  %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4
+  %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062
+  %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4
+  %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086
+  %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4
+  %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15
+  %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4
+  %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039
+  %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4
+  %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063
+  %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4
+  %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087
+  %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4
+  %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16
+  %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4
+  %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040
+  %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4
+  %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064
+  %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4
+  %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088
+  %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4
+  %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17
+  %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4
+  %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041
+  %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4
+  %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065
+  %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4
+  %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089
+  %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4
+  %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18
+  %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4
+  %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042
+  %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4
+  %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066
+  %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4
+  %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090
+  %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4
+  %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19
+  %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4
+  %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043
+  %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4
+  %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067
+  %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4
+  %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091
+  %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4
+  %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20
+  %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4
+  %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044
+  %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4
+  %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068
+  %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4
+  %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092
+  %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4
+  %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21
+  %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4
+  %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045
+  %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4
+  %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069
+  %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4
+  %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093
+  %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4
+  %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22
+  %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4
+  %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046
+  %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4
+  %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070
+  %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4
+  %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094
+  %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4
+  %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23
+  %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4
+  %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047
+  %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4
+  %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071
+  %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4
+  %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095
+  %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4
+  %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24
+  %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4
+  %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048
+  %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4
+  %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072
+  %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4
+  %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096
+  %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4
+  %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25
+  %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4
+  %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049
+  %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4
+  %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073
+  %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4
+  %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097
+  %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4
+  %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26
+  %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4
+  %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050
+  %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4
+  %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074
+  %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4
+  %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098
+  %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4
+  %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27
+  %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4
+  %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051
+  %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4
+  %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075
+  %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4
+  %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099
+  %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4
+  %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28
+  %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4
+  %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052
+  %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4
+  %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076
+  %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4
+  %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100
+  %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4
+  %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29
+  %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4
+  %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053
+  %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4
+  %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077
+  %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4
+  %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101
+  %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4
+  %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30
+  %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4
+  %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054
+  %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4
+  %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078
+  %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4
+  %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102
+  %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4
+  %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31
+  %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4
+  %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055
+  %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4
+  %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079
+  %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4
+  %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103
+  %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4
+  %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196
+  %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0)
+  %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140)
+  %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141)
+  %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142)
+  %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0)
+  %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144)
+  %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145)
+  %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146)
+  %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0)
+  %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148)
+  %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149)
+  %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150)
+  %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0)
+  %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152)
+  %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153)
+  %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154)
+  %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0)
+  %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156)
+  %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157)
+  %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158)
+  %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0)
+  %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160)
+  %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161)
+  %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162)
+  %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0)
+  %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164)
+  %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165)
+  %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166)
+  %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0)
+  %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168)
+  %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169)
+  %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170)
+  %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0)
+  %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172)
+  %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173)
+  %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174)
+  %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0)
+  %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176)
+  %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177)
+  %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178)
+  %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0)
+  %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180)
+  %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181)
+  %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182)
+  %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0)
+  %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184)
+  %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185)
+  %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186)
+  %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0)
+  %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188)
+  %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189)
+  %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190)
+  %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0)
+  %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192)
+  %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193)
+  %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194)
+  %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0)
+  %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196)
+  %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197)
+  %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198)
+  %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0)
+  %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200)
+  %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201)
+  %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202)
+  %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0)
+  %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204)
+  %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205)
+  %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206)
+  %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0)
+  %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208)
+  %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209)
+  %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210)
+  %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0)
+  %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212)
+  %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213)
+  %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214)
+  %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0)
+  %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216)
+  %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217)
+  %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218)
+  %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0)
+  %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220)
+  %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221)
+  %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222)
+  %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0)
+  %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224)
+  %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225)
+  %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226)
+  %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0)
+  %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228)
+  %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229)
+  %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230)
+  %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0)
+  %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232)
+  %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233)
+  %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234)
+  %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0)
+  %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236)
+  %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237)
+  %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238)
+  %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0)
+  %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240)
+  %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241)
+  %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242)
+  %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0)
+  %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244)
+  %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245)
+  %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246)
+  %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0)
+  %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248)
+  %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249)
+  %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250)
+  %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0)
+  %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252)
+  %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253)
+  %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254)
+  %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0)
+  %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256)
+  %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257)
+  %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258)
+  %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0)
+  %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260)
+  %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261)
+  %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262)
+  %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0)
+  %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264)
+  %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265)
+  %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266)
+  %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096
+  %inc116 = add nuw nsw i32 %ci.0286, 1
+  %exitcond.not = icmp eq i32 %inc116, 512
+  br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader
+
+for.cond.cleanup26:                               ; preds = %for.cond28.preheader
+  %mul119 = shl nuw nsw i32 undef, 1
+  %mul120 = mul i32 %div, 200704
+  %mul121 = mul i32 undef, 6272
+  %add122 = add i32 %mul120, %mul121
+  %mul123 = mul nuw nsw i32 undef, 28
+  %add124 = add i32 %add122, %mul123
+  %add126 = add i32 %add124, %mul119
+  %idx.ext127 = zext i32 %add126 to i64
+  %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127
+  store float %i143, ptr addrspace(1) %add.ptr128, align 4
+  %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196
+  store float %i147, ptr addrspace(1) %add.ptr184, align 4
+  %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4
+  %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4
+  %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196
+  store float %i151, ptr addrspace(1) %add.ptr184.1, align 4
+  %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196
+  store float %i155, ptr addrspace(1) %add.ptr184.2, align 4
+  %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196
+  store float %i159, ptr addrspace(1) %add.ptr184.3, align 4
+  %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196
+  store float %i163, ptr addrspace(1) %add.ptr184.4, align 4
+  %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4
+  %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196
+  store float %i167, ptr addrspace(1) %add.ptr184.5, align 4
+  %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4
+  %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196
+  store float %i171, ptr addrspace(1) %add.ptr184.6, align 4
+  %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196
+  store float %i175, ptr addrspace(1) %add.ptr184.7, align 4
+  %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4
+  %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4
+  %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196
+  store float %i179, ptr addrspace(1) %add.ptr184.8, align 4
+  %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196
+  store float %i183, ptr addrspace(1) %add.ptr184.9, align 4
+  %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196
+  store float %i187, ptr addrspace(1) %add.ptr184.10, align 4
+  %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196
+  store float %i191, ptr addrspace(1) %add.ptr184.11, align 4
+  %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196
+  store float %i195, ptr addrspace(1) %add.ptr184.12, align 4
+  %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196
+  store float %i199, ptr addrspace(1) %add.ptr184.13, align 4
+  %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196
+  store float %i203, ptr addrspace(1) %add.ptr184.14, align 4
+  %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196
+  store float %i207, ptr addrspace(1) %add.ptr184.15, align 4
+  %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196
+  store float %i211, ptr addrspace(1) %add.ptr184.16, align 4
+  %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196
+  store float %i215, ptr addrspace(1) %add.ptr184.17, align 4
+  %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196
+  store float %i219, ptr addrspace(1) %add.ptr184.18, align 4
+  %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196
+  store float %i223, ptr addrspace(1) %add.ptr184.19, align 4
+  %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196
+  store float %i227, ptr addrspace(1) %add.ptr184.20, align 4
+  %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196
+  store float %i231, ptr addrspace(1) %add.ptr184.21, align 4
+  %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196
+  store float %i235, ptr addrspace(1) %add.ptr184.22, align 4
+  %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196
+  store float %i239, ptr addrspace(1) %add.ptr184.23, align 4
+  %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196
+  store float %i243, ptr addrspace(1) %add.ptr184.24, align 4
+  %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196
+  store float %i247, ptr addrspace(1) %add.ptr184.25, align 4
+  %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196
+  store float %i251, ptr addrspace(1) %add.ptr184.26, align 4
+  %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196
+  store float %i255, ptr addrspace(1) %add.ptr184.27, align 4
+  %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196
+  store float %i259, ptr addrspace(1) %add.ptr184.28, align 4
+  %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196
+  store float %i263, ptr addrspace(1) %add.ptr184.29, align 4
+  %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196
+  store float %i267, ptr addrspace(1) %add.ptr184.30, align 4
+  ret void
+}
+
+
+
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workgroup.id.x() #3
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+!0 = !{i32 1, i32 2, i32 1, i32 0}
+!1 = !{!"none", !"none", !"none", !"none"}
+!2 = !{!"ptr", !"ptr", !"ptr", !"float"}
+!3 = !{!"restrict const", !"restrict const", !"restrict", !""}
+!4 = !{i32 256, i32 1, i32 1}
+!5 = !{i32 0, i32 1024}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="64" }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readnone speculatable willreturn }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 14bb4310c619ea..3ce6279f9082fb 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 --- |
   define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
@@ -11,6 +12,20 @@
 # GCN-LABEL: name: no_sched_metric_due_to_spills
 # GCN-NOT: SI_SPILL_
 # GCN: S_ENDPGM
+
+# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: S_ENDPGM
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is still high,
+# and RA is unable to allocate without spills. By running the high-RP-reschedule stage we would have further decreased RP, which provides increased
+# flexibility for RA.
+
 ---
 name:            no_sched_metric_due_to_spills
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 94815558bf3d6d..71f8d91874f04f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -1,16 +1,24 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs  < %s | FileCheck --check-prefix=OCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs  < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true  < %s | FileCheck --check-prefix=RELAX %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true  < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
 
 
 ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target
 
-; GCN-LABEL: {{^}}load_fma_store:
+; CHECK-LABEL: {{^}}load_fma_store:
 ; OCC:    NumVgprs: 32
+; OCC-GCNTRACKER:    NumVgprs: 24
 ; RELAX:    NumVgprs: 64
+; RELAX-GCNTRACKER:    NumVgprs: 60
 ; OCC: NumVGPRsForWavesPerEU: 32
+; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24
 ; RELAX: NumVGPRsForWavesPerEU: 64
+; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60
 ; OCC:    Occupancy: 8
+; OCC-GCNTRACKER:    Occupancy: 8
 ; RELAX: Occupancy: 4
+; RELAX-GCNTRACKER: Occupancy: 4
 
 define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 {
 bb:

>From 7ee4ffdb98697d661420901055782ccc607565f5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 27 Sep 2024 12:40:02 -0700
Subject: [PATCH 22/26] Remove CurrLIS

Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp   | 21 ++++++++-------------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     | 10 ++++------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp |  2 +-
 3 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index fb92924363d43b..888b5907a979e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -595,26 +595,22 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
 }
 
 bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
-                                             bool UseInternalIterator,
-                                             LiveIntervals *TheLIS) {
+                                             bool UseInternalIterator) {
   assert(MRI && "call reset first");
   SlotIndex SI;
-  const LiveIntervals *CurrLIS;
   const MachineInstr *CurrMI;
   if (UseInternalIterator) {
     if (!LastTrackedMI)
       return NextMI == MBBEnd;
 
     assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
-    CurrLIS = &LIS;
     CurrMI = LastTrackedMI;
 
     SI = NextMI == MBBEnd
-             ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
-             : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex();
+             ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
+             : LIS.getInstructionIndex(*NextMI).getBaseIndex();
   } else { //! UseInternalIterator
-    CurrLIS = TheLIS;
-    SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex();
+    SI = LIS.getInstructionIndex(*MI).getBaseIndex();
     CurrMI = MI;
   }
 
@@ -631,7 +627,7 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
       continue;
     if (!SeenRegs.insert(MO.getReg()).second)
       continue;
-    const LiveInterval &LI = CurrLIS->getInterval(MO.getReg());
+    const LiveInterval &LI = LIS.getInterval(MO.getReg());
     if (LI.hasSubRanges()) {
       auto It = LiveRegs.end();
       for (const auto &S : LI.subranges()) {
@@ -689,16 +685,15 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
   MaxPressure = max(MaxPressure, CurPressure);
 }
 
-bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
-                                   LiveIntervals *TheLIS) {
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) {
   if (UseInternalIterator && NextMI == MBBEnd)
     return false;
 
-  advanceBeforeNext(MI, UseInternalIterator, TheLIS);
+  advanceBeforeNext(MI, UseInternalIterator);
   advanceToNext(MI, UseInternalIterator);
   if (!UseInternalIterator) {
     // We must remove any dead def lanes from the current RP
-    advanceBeforeNext(MI, true, TheLIS);
+    advanceBeforeNext(MI, true);
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 463da472bb69ff..169c2e42c08054 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -280,10 +280,9 @@ class GCNDownwardRPTracker : public GCNRPTracker {
   /// it is assumed that the tracker is using an externally managed iterator,
   /// and advance* calls will not update the state of the iterator. In such
   /// cases, the tracker will move to the state right before the provided \p MI
-  /// and use the provided \p TheLIS for RP calculations.
+  /// and use LIS for RP calculations.
   bool advanceBeforeNext(MachineInstr *MI = nullptr,
-                         bool UseInternalIterator = true,
-                         LiveIntervals *TheLIS = nullptr);
+                         bool UseInternalIterator = true);
 
   /// Move to the state at the MI, advanceBeforeNext has to be called first.
   /// If \p UseInternalIterator is true, then internal iterators are used and
@@ -300,9 +299,8 @@ class GCNDownwardRPTracker : public GCNRPTracker {
   /// then it is assumed that the tracker is using an externally managed
   /// iterator, and advance* calls will not update the state of the iterator. In
   /// such cases, the tracker will move to the state right before the provided
-  /// \p MI and use the provided \p TheLIS for RP calculations.
-  bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true,
-               LiveIntervals *TheLIS = nullptr);
+  /// \p MI and use LIS for RP calculations.
+  bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true);
 
   /// Advance instructions until before \p End.
   bool advance(MachineBasicBlock::const_iterator End);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 28ca41d2dc96ed..b47cdb2e7ddcf1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -490,7 +490,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   if (GCNTrackers) {
     MachineInstr *MI = SU->getInstr();
-    IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
+    IsTopNode ? (void)DownwardTracker.advance(MI, false)
               : UpwardTracker.recede(*MI);
   }
 

>From bf61d05d7a7af61f4f6d9c3452f1f817f84f548b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 3 Oct 2024 13:42:41 -0700
Subject: [PATCH 23/26] Mark speculative query methods as const

Change-Id: I9ebe0cf7252068dcee90d419945085efae75547d
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp   | 55 +++++++++------------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  6 ++-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 21 +++-----
 3 files changed, 33 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 888b5907a979e7..a7a3c65c3388b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -448,19 +448,6 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
   MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
 }
 
-void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
-  GCNRegPressure TempPressure = CurPressure;
-  for (const RegisterMaskPair &P : DeadDefs) {
-    Register Reg = P.RegUnit;
-    if (!Reg.isVirtual())
-      continue;
-    LaneBitmask LiveMask = LiveRegs[Reg];
-    LaneBitmask BumpedMask = LiveMask | P.LaneMask;
-    CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
-  }
-  MaxPressure = max(MaxPressure, CurPressure);
-  CurPressure = TempPressure;
-}
 /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
                                            SlotIndex Pos) const {
@@ -535,8 +522,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(CurPressure == getRegPressure(*MRI, LiveRegs));
 }
 
-void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
-                                            const SIRegisterInfo *TRI) {
+GCNRegPressure
+GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
+                                       const SIRegisterInfo *TRI) const {
   assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
 
   SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -549,33 +537,32 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
   adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI);
   RegOpers.detectDeadDefs(*MI, LIS);
 
-  // Boost max pressure for all dead defs together.
-  // Since CurrSetPressure and MaxSetPressure
-  bumpDeadDefs(RegOpers.DeadDefs);
+  GCNRegPressure TempPressure = CurPressure;
 
   // Kill liveness at live defs.
   for (const RegisterMaskPair &P : RegOpers.Defs) {
     Register Reg = P.RegUnit;
     if (!Reg.isVirtual())
       continue;
-    LaneBitmask LiveAfter = LiveRegs[Reg];
+    LaneBitmask LiveAfter =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
     LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
     LaneBitmask DefLanes = P.LaneMask;
     LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
 
-    CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
-    MaxPressure = max(MaxPressure, CurPressure);
+    TempPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
   }
   // Generate liveness for uses.
   for (const RegisterMaskPair &P : RegOpers.Uses) {
     Register Reg = P.RegUnit;
     if (!Reg.isVirtual())
       continue;
-    LaneBitmask LiveAfter = LiveRegs[Reg];
+    LaneBitmask LiveAfter =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
     LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
-    CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
+    TempPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
   }
-  MaxPressure = max(MaxPressure, CurPressure);
+  return TempPressure;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -736,8 +723,9 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
   });
 }
 
-void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
-                                                const SIRegisterInfo *TRI) {
+GCNRegPressure
+GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+                                           const SIRegisterInfo *TRI) const {
   assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
 
   SlotIndex SlotIdx;
@@ -747,6 +735,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
   RegisterOperands RegOpers;
   RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
   RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+  GCNRegPressure TempPressure = CurPressure;
 
   for (const RegisterMaskPair &Use : RegOpers.Uses) {
     Register Reg = Use.RegUnit;
@@ -775,9 +764,10 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
     if (LastUseMask.none())
       continue;
 
-    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask LiveMask =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
     LaneBitmask NewMask = LiveMask & ~LastUseMask;
-    CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+    TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
   }
 
   // Generate liveness for defs.
@@ -785,14 +775,13 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
     Register Reg = Def.RegUnit;
     if (!Reg.isVirtual())
       continue;
-    LaneBitmask LiveMask = LiveRegs[Reg];
+    LaneBitmask LiveMask =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
     LaneBitmask NewMask = LiveMask | Def.LaneMask;
-    CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+    TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
   }
-  MaxPressure = max(MaxPressure, CurPressure);
 
-  // Boost pressure for all dead defs together.
-  bumpDeadDefs(RegOpers.DeadDefs);
+  return TempPressure;
 }
 
 bool GCNUpwardRPTracker::isValid() const {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 169c2e42c08054..a583efb457aea6 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -228,7 +228,8 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   /// does not rely on the implicit program ordering in the LiveIntervals to
   /// support RP Speculation. It leaves the state of pressure inconsistent with
   /// the current position
-  void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
+  GCNRegPressure bumpUpwardPressure(const MachineInstr *MI,
+                                    const SIRegisterInfo *TRI) const;
 
   /// \p returns whether the tracker's state after receding MI corresponds
   /// to reported by LIS.
@@ -315,7 +316,8 @@ class GCNDownwardRPTracker : public GCNRPTracker {
   /// does not rely on the implicit program ordering in the LiveIntervals to
   /// support RP Speculation. It leaves the state of pressure inconsistent with
   /// the current position
-  void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
+  GCNRegPressure bumpDownwardPressure(const MachineInstr *MI,
+                                      const SIRegisterInfo *TRI) const;
 };
 
 LaneBitmask getLiveLaneMask(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b47cdb2e7ddcf1..e28acd4c07beb6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -173,25 +173,18 @@ static void getRegisterPressures(
   // GCNTrackers
   Pressure.resize(4, 0);
   MachineInstr *MI = SU->getInstr();
+  GCNRegPressure NewPressure;
   if (AtTop) {
     GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
-    TempDownwardTracker.bumpDownwardPressure(MI, SRI);
-    Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
-        TempDownwardTracker.getPressure().getSGPRNum();
-    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-        TempDownwardTracker.getPressure().getArchVGPRNum();
-    Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
-        TempDownwardTracker.getPressure().getAGPRNum();
+    NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
   } else {
     GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
-    TempUpwardTracker.bumpUpwardPressure(MI, SRI);
-    Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
-        TempUpwardTracker.getPressure().getSGPRNum();
-    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-        TempUpwardTracker.getPressure().getArchVGPRNum();
-    Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
-        TempUpwardTracker.getPressure().getAGPRNum();
+    NewPressure = TempUpwardTracker.bumpUpwardPressure(MI, SRI);
   }
+  Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
+  Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+      NewPressure.getArchVGPRNum();
+  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,

>From 62d058e522f9a5edb9d599cf718772f6e10c3cbe Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sun, 6 Oct 2024 15:43:51 -0700
Subject: [PATCH 24/26] Fix lit tests

Change-Id: Ie204904f04dc9d2f53d586795c886a3f8c6b1268
---
 llvm/test/CodeGen/AMDGPU/pr51516.mir                          | 4 ++--
 .../CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 49dd5c6c39ff5c..f496a4b06bb237 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 # Check that %3 was not rematerialized before the last store since its operand %1
 # is killed by that store.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 3ce6279f9082fb..34d203e0de2ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 --- |
   define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {

>From 59fa570be10bfa78f7b3a060f3ddeb06c586c53b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 7 Oct 2024 20:55:13 +0200
Subject: [PATCH 25/26] [AMDGPU] Constrain use LiveMask by the operand's
 LaneMask for RP calculation.

Co-authored-by: Valery Pykhtin <Valery.Pykhtin at amd.com>

For speculative RP queries, recede may calculate inaccurate masks for subreg uses. Previously, the calculation would look at any live lane for the use at the position of the MI in the LIS. This also adds lanes for any subregs which are live at but not used by the instruction. By constraining against the getSubRegIndexLaneMask for the operand's subreg, we are sure to not pick up on these extra lanes.

For current clients of recede, this is not an issue. This is because 1. the current clients do not violate the program order in the LIS, and 2. the change to RP is based on the difference between previous mask and new mask. Since current clients are not exposed to this issue, this patch is sort of NFC.

Change-Id: I63c788785b104ed88297d06e9b2b0dbc0bf6d040
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 52 ++++++++++++++---------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h   |  9 ++--
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index a7a3c65c3388b3..d79959a9852b9d 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -259,7 +259,8 @@ static void
 collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
                       const MachineInstr &MI, const LiveIntervals &LIS,
                       const MachineRegisterInfo &MRI) {
-  SlotIndex InstrSI;
+
+  auto &TRI = *MRI.getTargetRegisterInfo();
   for (const auto &MO : MI.operands()) {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       continue;
@@ -267,25 +268,31 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
       continue;
 
     Register Reg = MO.getReg();
-    if (llvm::any_of(RegMaskPairs, [Reg](const RegisterMaskPair &RM) {
-          return RM.RegUnit == Reg;
-        }))
-      continue;
+    auto I = llvm::find_if(RegMaskPairs, [Reg](const RegisterMaskPair &RM) {
+      return RM.RegUnit == Reg;
+    });
 
-    LaneBitmask UseMask;
-    auto &LI = LIS.getInterval(Reg);
+    auto &P = I == RegMaskPairs.end()
+                  ? RegMaskPairs.emplace_back(Reg, LaneBitmask::getNone())
+                  : *I;
+
+    P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg())
+                                 : MRI.getMaxLaneMaskForVReg(Reg);
+  }
+
+  SlotIndex InstrSI;
+  for (auto &P : RegMaskPairs) {
+    auto &LI = LIS.getInterval(P.RegUnit);
     if (!LI.hasSubRanges())
-      UseMask = MRI.getMaxLaneMaskForVReg(Reg);
-    else {
-      // For a tentative schedule LIS isn't updated yet but livemask should
-      // remain the same on any schedule. Subreg defs can be reordered but they
-      // all must dominate uses anyway.
-      if (!InstrSI)
-        InstrSI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
-      UseMask = getLiveLaneMask(LI, InstrSI, MRI);
-    }
+      continue;
 
-    RegMaskPairs.emplace_back(Reg, UseMask);
+    // For a tentative schedule LIS isn't updated yet but livemask should
+    // remain the same on any schedule. Subreg defs can be reordered but they
+    // all must dominate uses anyway.
+    if (!InstrSI)
+      InstrSI = LIS.getInstructionIndex(MI).getBaseIndex();
+
+    P.LaneMask = getLiveLaneMask(LI, InstrSI, MRI, P.LaneMask);
   }
 }
 
@@ -390,22 +397,25 @@ static void adjustDefLaneLiveness(SmallVectorImpl<RegisterMaskPair> &Defs,
 
 LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI,
                                   const LiveIntervals &LIS,
-                                  const MachineRegisterInfo &MRI) {
-  return getLiveLaneMask(LIS.getInterval(Reg), SI, MRI);
+                                  const MachineRegisterInfo &MRI,
+                                  LaneBitmask Mask) {
+  return getLiveLaneMask(LIS.getInterval(Reg), SI, MRI, Mask);
 }
 
 LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
-                                  const MachineRegisterInfo &MRI) {
+                                  const MachineRegisterInfo &MRI,
+                                  LaneBitmask Mask) {
   LaneBitmask LiveMask;
   if (LI.hasSubRanges()) {
     for (const auto &S : LI.subranges())
-      if (S.liveAt(SI)) {
+      if ((S.LaneMask & Mask).any() && S.liveAt(SI)) {
         LiveMask |= S.LaneMask;
         assert(LiveMask == (LiveMask & MRI.getMaxLaneMaskForVReg(LI.reg())));
       }
   } else if (LI.liveAt(SI)) {
     LiveMask = MRI.getMaxLaneMaskForVReg(LI.reg());
   }
+  LiveMask &= Mask;
   return LiveMask;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a583efb457aea6..4ae9dacd799f11 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -320,13 +320,14 @@ class GCNDownwardRPTracker : public GCNRPTracker {
                                       const SIRegisterInfo *TRI) const;
 };
 
-LaneBitmask getLiveLaneMask(unsigned Reg,
-                            SlotIndex SI,
+LaneBitmask getLiveLaneMask(unsigned Reg, SlotIndex SI,
                             const LiveIntervals &LIS,
-                            const MachineRegisterInfo &MRI);
+                            const MachineRegisterInfo &MRI,
+                            LaneBitmask Mask = LaneBitmask::getAll());
 
 LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
-                            const MachineRegisterInfo &MRI);
+                            const MachineRegisterInfo &MRI,
+                            LaneBitmask Mask = LaneBitmask::getAll());
 
 GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
                                      const MachineRegisterInfo &MRI);

>From f36fe21f3cf69f0c2cd4c62d41e6e1f649f691bb Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 7 Oct 2024 15:20:02 -0700
Subject: [PATCH 26/26] Remove bumpUpwardPressure

Change-Id: I74c8ed0076ff8557d9a23a7ec7b1c9c00290be01
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 73 -----------------------
 llvm/lib/Target/AMDGPU/GCNRegPressure.h   |  8 ---
 2 files changed, 81 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index d79959a9852b9d..f62b426ee89c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -296,17 +296,6 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
   }
 }
 
-/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
-static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
-                               Register RegUnit) {
-  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
-    return Other.RegUnit == RegUnit;
-  });
-  if (I == RegUnits.end())
-    return LaneBitmask::getNone();
-  return I->LaneMask;
-}
-
 /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
 static LaneBitmask getLanesWithProperty(
     const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
@@ -373,25 +362,6 @@ static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
       [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); });
 }
 
-// Copy/paste from RegisterPressure.cpp (RegisterOperands::adjustLaneLiveness)
-static void adjustDefLaneLiveness(SmallVectorImpl<RegisterMaskPair> &Defs,
-                                  SlotIndex &Pos, const LiveIntervals &LIS,
-                                  const MachineRegisterInfo &MRI) {
-  for (auto *I = Defs.begin(); I != Defs.end();) {
-    LaneBitmask LiveAfter =
-        getLiveLanesAt(LIS, MRI, true, I->RegUnit, Pos.getDeadSlot());
-    // If the def is all that is live after the instruction, then in case
-    // of a subregister def we need a read-undef flag.
-    LaneBitmask ActualDef = I->LaneMask & LiveAfter;
-    if (ActualDef.none()) {
-      I = Defs.erase(I);
-    } else {
-      I->LaneMask = ActualDef;
-      ++I;
-    }
-  }
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // GCNRPTracker
 
@@ -532,49 +502,6 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(CurPressure == getRegPressure(*MRI, LiveRegs));
 }
 
-GCNRegPressure
-GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
-                                       const SIRegisterInfo *TRI) const {
-  assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
-
-  SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
-
-  // Account for register pressure similar to RegPressureTracker::recede().
-  RegisterOperands RegOpers;
-
-  RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
-  assert(RegOpers.DeadDefs.empty());
-  adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI);
-  RegOpers.detectDeadDefs(*MI, LIS);
-
-  GCNRegPressure TempPressure = CurPressure;
-
-  // Kill liveness at live defs.
-  for (const RegisterMaskPair &P : RegOpers.Defs) {
-    Register Reg = P.RegUnit;
-    if (!Reg.isVirtual())
-      continue;
-    LaneBitmask LiveAfter =
-        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
-    LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
-    LaneBitmask DefLanes = P.LaneMask;
-    LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
-
-    TempPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
-  }
-  // Generate liveness for uses.
-  for (const RegisterMaskPair &P : RegOpers.Uses) {
-    Register Reg = P.RegUnit;
-    if (!Reg.isVirtual())
-      continue;
-    LaneBitmask LiveAfter =
-        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
-    LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
-    TempPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
-  }
-  return TempPressure;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // GCNDownwardRPTracker
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 4ae9dacd799f11..3c460a8a33883c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -223,14 +223,6 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   /// to false allows for an externally managed iterator / program order.
   void recede(const MachineInstr &MI);
 
-  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
-  /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This
-  /// does not rely on the implicit program ordering in the LiveIntervals to
-  /// support RP Speculation. It leaves the state of pressure inconsistent with
-  /// the current position
-  GCNRegPressure bumpUpwardPressure(const MachineInstr *MI,
-                                    const SIRegisterInfo *TRI) const;
-
   /// \p returns whether the tracker's state after receding MI corresponds
   /// to reported by LIS.
   bool isValid() const;



More information about the llvm-commits mailing list