[llvm] [AMDGPU] Optionally Use GCNRPTrackers during scheduling (PR #93090)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 9 09:43:52 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/93090

From 276193f872ceb2e2fa82e7a34b7c4a53ec42e55c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 12:55:07 -0700
Subject: [PATCH] [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling

---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |   2 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 199 +++++-
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |  96 ++-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 148 +++-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  60 +-
 .../CodeGen/AMDGPU/high-RP-reschedule.mir     |  10 +-
 llvm/test/CodeGen/AMDGPU/pr51516.mir          |   6 +-
 .../schedule-amdgpu-tracker-physreg-crash.ll  |  65 ++
 .../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++
 .../AMDGPU/schedule-amdgpu-trackers.ll        | 647 ++++++++++++++++++
 ...schedule-regpressure-ilp-metric-spills.mir |  17 +-
 .../AMDGPU/schedule-relaxed-occupancy.ll      |  10 +-
 12 files changed, 1672 insertions(+), 79 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 13504508e2fb2e..da065e8d8cb6b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
   LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
                        "target occupancy = "
                     << TgtOcc << '\n');
-  GCNMaxOccupancySchedStrategy LStrgy(Context);
+  GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true);
   unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
 
   for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7c633b2bce7bc2..d46c4cf23a221e 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -296,6 +296,63 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
   }
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+static LaneBitmask getLanesWithProperty(
+    const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+    bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+    LaneBitmask SafeDefault,
+    function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
+  if (RegUnit.isVirtual()) {
+    const LiveInterval &LI = LIS.getInterval(RegUnit);
+    LaneBitmask Result;
+    if (TrackLaneMasks && LI.hasSubRanges()) {
+      for (const LiveInterval::SubRange &SR : LI.subranges()) {
+        if (Property(SR, Pos))
+          Result |= SR.LaneMask;
+      }
+    } else if (Property(LI, Pos)) {
+      Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
+                              : LaneBitmask::getAll();
+    }
+
+    return Result;
+  }
+
+  const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+  if (LR == nullptr)
+    return SafeDefault;
+  return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+}
+
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+/// Helper to find a vreg use between two indices {PriorUseIdx, NextUseIdx}.
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+                                  SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+                                  const MachineRegisterInfo &MRI,
+                                  const SIRegisterInfo *TRI,
+                                  const LiveIntervals *LIS,
+                                  bool Upward = false) {
+  for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+    if (MO.isUndef())
+      continue;
+    const MachineInstr *MI = MO.getParent();
+    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+    bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
+                          : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
+    if (!InRange)
+      continue;
+
+    unsigned SubRegIdx = MO.getSubReg();
+    LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+    LastUseMask &= ~UseMask;
+    if (LastUseMask.none())
+      return LaneBitmask::getNone();
+  }
+  return LastUseMask;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // GCNRPTracker
 
@@ -354,17 +411,28 @@ void GCNRPTracker::reset(const MachineInstr &MI,
   MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// GCNUpwardRPTracker
-
-void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_,
-                               const LiveRegSet &LiveRegs_) {
+void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
+                         const LiveRegSet &LiveRegs_) {
   MRI = &MRI_;
   LiveRegs = LiveRegs_;
   LastTrackedMI = nullptr;
   MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
 }
 
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
+                                           SlotIndex Pos) const {
+  return getLanesWithProperty(
+      LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(),
+      [](const LiveRange &LR, SlotIndex Pos) {
+        const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+        return S != nullptr && S->end == Pos.getRegSlot();
+      });
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
 void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   assert(MRI && "call reset first");
 
@@ -441,25 +509,37 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
   return true;
 }
 
-bool GCNDownwardRPTracker::advanceBeforeNext() {
+bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
+                                             bool UseInternalIterator) {
   assert(MRI && "call reset first");
-  if (!LastTrackedMI)
-    return NextMI == MBBEnd;
-
-  assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+  SlotIndex SI;
+  const MachineInstr *CurrMI;
+  if (UseInternalIterator) {
+    if (!LastTrackedMI)
+      return NextMI == MBBEnd;
+
+    assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+    CurrMI = LastTrackedMI;
+
+    SI = NextMI == MBBEnd
+             ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
+             : LIS.getInstructionIndex(*NextMI).getBaseIndex();
+  } else { // !UseInternalIterator
+    SI = LIS.getInstructionIndex(*MI).getBaseIndex();
+    CurrMI = MI;
+  }
 
-  SlotIndex SI = NextMI == MBBEnd
-                     ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
-                     : LIS.getInstructionIndex(*NextMI).getBaseIndex();
   assert(SI.isValid());
 
   // Remove dead registers or mask bits.
   SmallSet<Register, 8> SeenRegs;
-  for (auto &MO : LastTrackedMI->operands()) {
+  for (auto &MO : CurrMI->operands()) {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       continue;
     if (MO.isUse() && !MO.readsReg())
       continue;
+    if (!UseInternalIterator && MO.isDef())
+      continue;
     if (!SeenRegs.insert(MO.getReg()).second)
       continue;
     const LiveInterval &LI = LIS.getInterval(MO.getReg());
@@ -492,15 +572,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
 
   LastTrackedMI = nullptr;
 
-  return NextMI == MBBEnd;
+  return UseInternalIterator && (NextMI == MBBEnd);
 }
 
-void GCNDownwardRPTracker::advanceToNext() {
-  LastTrackedMI = &*NextMI++;
-  NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
+                                         bool UseInternalIterator) {
+  if (UseInternalIterator) {
+    LastTrackedMI = &*NextMI++;
+    NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+  } else {
+    LastTrackedMI = MI;
+  }
+
+  const MachineInstr *CurrMI = LastTrackedMI;
 
   // Add new registers or mask bits.
-  for (const auto &MO : LastTrackedMI->all_defs()) {
+  for (const auto &MO : CurrMI->all_defs()) {
     Register Reg = MO.getReg();
     if (!Reg.isVirtual())
       continue;
@@ -513,11 +600,16 @@ void GCNDownwardRPTracker::advanceToNext() {
   MaxPressure = max(MaxPressure, CurPressure);
 }
 
-bool GCNDownwardRPTracker::advance() {
-  if (NextMI == MBBEnd)
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) {
+  if (UseInternalIterator && NextMI == MBBEnd)
     return false;
-  advanceBeforeNext();
-  advanceToNext();
+
+  advanceBeforeNext(MI, UseInternalIterator);
+  advanceToNext(MI, UseInternalIterator);
+  if (!UseInternalIterator) {
+    // We must remove any dead def lanes from the current RP
+    advanceBeforeNext(MI, true);
+  }
   return true;
 }
 
@@ -559,6 +651,67 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
   });
 }
 
+GCNRegPressure
+GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+                                           const SIRegisterInfo *TRI) const {
+  assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+  SlotIndex SlotIdx;
+  SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+  // Account for register pressure similar to RegPressureTracker::recede().
+  RegisterOperands RegOpers;
+  RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
+  RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+  GCNRegPressure TempPressure = CurPressure;
+
+  for (const RegisterMaskPair &Use : RegOpers.Uses) {
+    Register Reg = Use.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+    if (LastUseMask.none())
+      continue;
+    // The LastUseMask is queried from the liveness information of instruction
+    // which may be further down the schedule. Some lanes may actually not be
+    // last uses for the current position.
+    // FIXME: allow the caller to pass in the list of vreg uses that remain
+    // to be bottom-scheduled to avoid searching uses at each query.
+    SlotIndex CurrIdx;
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(
+        LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end());
+    if (IdxPos == MBB->end()) {
+      CurrIdx = LIS.getMBBEndIdx(MBB);
+    } else {
+      CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot();
+    }
+
+    LastUseMask =
+        findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS);
+    if (LastUseMask.none())
+      continue;
+
+    LaneBitmask LiveMask =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
+    LaneBitmask NewMask = LiveMask & ~LastUseMask;
+    TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
+  }
+
+  // Generate liveness for defs.
+  for (const RegisterMaskPair &Def : RegOpers.Defs) {
+    Register Reg = Def.RegUnit;
+    if (!Reg.isVirtual())
+      continue;
+    LaneBitmask LiveMask =
+        LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
+    LaneBitmask NewMask = LiveMask | Def.LaneMask;
+    TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
+  }
+
+  return TempPressure;
+}
+
 bool GCNUpwardRPTracker::isValid() const {
   const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
   const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
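A minimal sketch of how the new bumpDownwardPressure() can be used to speculate on candidates without mutating tracker state (the Tracker is assumed to already be reset to the current region; pickLowerVGPRCandidate is a hypothetical helper, not part of this patch):

  // Speculatively compute the pressure each candidate would produce if
  // scheduled next, without committing either one to the tracker.
  static const MachineInstr *
  pickLowerVGPRCandidate(const GCNDownwardRPTracker &Tracker,
                         const MachineInstr *A, const MachineInstr *B,
                         const SIRegisterInfo *SRI) {
    GCNRegPressure RPA = Tracker.bumpDownwardPressure(A, SRI);
    GCNRegPressure RPB = Tracker.bumpDownwardPressure(B, SRI);
    // ArchVGPR count is the quantity the strategy feeds its heuristics.
    return RPA.getArchVGPRNum() <= RPB.getArchVGPRNum() ? A : B;
  }
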
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index d419fcc802c60a..06c3d9027db1b5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -19,6 +19,7 @@
 
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/RegisterPressure.h"
 #include <algorithm>
 
 namespace llvm {
@@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1,
   return Diff;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
 class GCNRPTracker {
 public:
   using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
@@ -165,7 +169,14 @@ class GCNRPTracker {
   void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
              bool After);
 
+  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+  void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
+
+  LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+
 public:
+  // reset tracker and set live register set to the specified value.
+  void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
   // live regs for the current state
   const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
   const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
@@ -182,34 +193,38 @@ class GCNRPTracker {
 GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
                                      const MachineRegisterInfo &MRI);
 
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
 class GCNUpwardRPTracker : public GCNRPTracker {
 public:
   GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
 
-  // reset tracker and set live register set to the specified value.
-  void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+  using GCNRPTracker::reset;
 
-  // reset tracker at the specified slot index.
+  /// reset tracker at the specified slot index \p SI.
   void reset(const MachineRegisterInfo &MRI, SlotIndex SI) {
-    reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
+    GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
   }
 
-  // reset tracker to the end of the MBB.
+  /// reset tracker to the end of the \p MBB.
   void reset(const MachineBasicBlock &MBB) {
     reset(MBB.getParent()->getRegInfo(),
           LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
   }
 
-  // reset tracker to the point just after MI (in program order).
+  /// reset tracker to the point just after \p MI (in program order).
   void reset(const MachineInstr &MI) {
     reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot());
   }
 
-  // move to the state just before the MI (in program order).
+  /// Move to the state of RP just before \p MI (in program order). The
+  /// instruction is always passed explicitly, so recede also works with an
+  /// externally managed iteration order.
   void recede(const MachineInstr &MI);
 
-  // checks whether the tracker's state after receding MI corresponds
-  // to reported by LIS.
+  /// \returns whether the tracker's state after receding MI corresponds
+  /// to that reported by LIS.
   bool isValid() const;
 
   const GCNRegPressure &getMaxPressure() const { return MaxPressure; }
@@ -223,6 +238,9 @@ class GCNUpwardRPTracker : public GCNRPTracker {
   }
 };
 
+////////////////////////////////////////////////////////////////////////////////
+// GCNDownwardRPTracker
+
 class GCNDownwardRPTracker : public GCNRPTracker {
   // Last position of reset or advanceBeforeNext
   MachineBasicBlock::const_iterator NextMI;
@@ -232,37 +250,65 @@ class GCNDownwardRPTracker : public GCNRPTracker {
 public:
   GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
 
+  using GCNRPTracker::reset;
+
   MachineBasicBlock::const_iterator getNext() const { return NextMI; }
 
-  // Return MaxPressure and clear it.
+  /// \returns MaxPressure and clears it.
   GCNRegPressure moveMaxPressure() {
     auto Res = MaxPressure;
     MaxPressure.clear();
     return Res;
   }
 
-  // Reset tracker to the point before the MI
-  // filling live regs upon this point using LIS.
-  // Returns false if block is empty except debug values.
+  /// Reset tracker to the point before the \p MI
+  /// filling \p LiveRegs upon this point using LIS.
+  /// \returns false if the block is empty except for debug values.
   bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
 
-  // Move to the state right before the next MI or after the end of MBB.
-  // Returns false if reached end of the block.
-  bool advanceBeforeNext();
-
-  // Move to the state at the MI, advanceBeforeNext has to be called first.
-  void advanceToNext();
-
-  // Move to the state at the next MI. Returns false if reached end of block.
-  bool advance();
-
-  // Advance instructions until before End.
+  /// Move to the state right before the next MI or after the end of MBB.
+  /// \returns false if the end of the block was reached.
+  /// If \p UseInternalIterator is true, then internal iterators are used and
+  /// set to process in program order. If \p UseInternalIterator is false, then
+  /// it is assumed that the tracker is using an externally managed iterator,
+  /// and advance* calls will not update the state of the iterator. In such
+  /// cases, the tracker will move to the state right before the provided \p MI
+  /// and use LIS for RP calculations.
+  bool advanceBeforeNext(MachineInstr *MI = nullptr,
+                         bool UseInternalIterator = true);
+
+  /// Move to the state at the MI, advanceBeforeNext has to be called first.
+  /// If \p UseInternalIterator is true, then internal iterators are used and
+  /// set to process in program order. If \p UseInternalIterator is false, then
+  /// it is assumed that the tracker is using an externally managed iterator,
+  /// and advance* calls will not update the state of the iterator. In such
+  /// cases, the tracker will move to the state at the provided \p MI.
+  void advanceToNext(MachineInstr *MI = nullptr,
+                     bool UseInternalIterator = true);
+
+  /// Move to the state at the next MI. \returns false if reached end of
+  /// block. If \p UseInternalIterator is true, then internal iterators are used
+  /// and set to process in program order. If \p UseInternalIterator is false,
+  /// then it is assumed that the tracker is using an externally managed
+  /// iterator, and advance* calls will not update the state of the iterator. In
+  /// such cases, the tracker will move to the state right before the provided
+  /// \p MI and use LIS for RP calculations.
+  bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true);
+
+  /// Advance instructions until before \p End.
   bool advance(MachineBasicBlock::const_iterator End);
 
-  // Reset to Begin and advance to End.
+  /// Reset to \p Begin and advance to \p End.
   bool advance(MachineBasicBlock::const_iterator Begin,
                MachineBasicBlock::const_iterator End,
                const LiveRegSet *LiveRegsCopy = nullptr);
+
+  /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+  /// Calculate the impact \p MI will have on CurPressure and \return the
+  /// speculated pressure. In order to support RP Speculation, this does not
+  /// rely on the implicit program ordering in the LiveIntervals.
+  GCNRegPressure bumpDownwardPressure(const MachineInstr *MI,
+                                      const SIRegisterInfo *TRI) const;
 };
 
 /// \returns the LaneMask of live lanes of \p Reg at position \p SI. Only the
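To make the UseInternalIterator comments above concrete, a rough sketch of the two driving modes (LIS, MRI, MBB, RegionLiveIns and PickOrder are placeholders assumed to be in scope, not part of this patch):

  GCNDownwardRPTracker T(LIS);

  // Mode 1: internal iterator, program order.
  T.reset(*MBB.begin());
  while (T.advance())
    ; // NextMI walks the block; MaxPressure accumulates as it goes.

  // Mode 2: externally managed order. The tracker does not touch its
  // internal iterator and instead queries LIS around the instruction it
  // is handed.
  T.reset(MRI, RegionLiveIns);
  for (MachineInstr *MI : PickOrder)
    T.advance(MI, /*UseInternalIterator=*/false);

The second mode is what GCNSchedStrategy::schedNode() uses to keep the DownwardTracker in sync with the order the scheduler actually picks.
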
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d6958d9055fade..11c95675aeeafa 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -58,11 +58,17 @@ static cl::opt<bool>
                         "Wave Limited (amdgpu-limit-wave-threshold)."),
                cl::init(false));
 
+static cl::opt<bool> GCNTrackers(
+    "amdgpu-use-amdgpu-trackers", cl::Hidden,
+    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
+    cl::init(false));
+
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
-      HasHighPressure(false) {}
+      DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
+}
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -148,17 +154,38 @@ static bool canUsePressureDiffs(const SUnit &SU) {
   return true;
 }
 
-static void getRegisterPressures(bool AtTop,
-                                 const RegPressureTracker &RPTracker, SUnit *SU,
-                                 std::vector<unsigned> &Pressure,
-                                 std::vector<unsigned> &MaxPressure) {
+static void getRegisterPressures(
+    bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
+    std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
+    GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
+    ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
   // getDownwardPressure() and getUpwardPressure() make temporary changes to
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
-  if (AtTop)
-    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
-  else
-    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  if (!GCNTrackers) {
+    AtTop
+        ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
+        : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+
+    return;
+  }
+
+  // GCNTrackers
+  Pressure.resize(4, 0);
+  MachineInstr *MI = SU->getInstr();
+  GCNRegPressure NewPressure;
+  if (AtTop) {
+    GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
+    NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
+  } else {
+    GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
+    TempUpwardTracker.recede(*MI);
+    NewPressure = TempUpwardTracker.getPressure();
+  }
+  Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
+  Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+      NewPressure.getArchVGPRNum();
+  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -187,8 +214,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   //
   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
   // PressureDiffs.
-  if (AtTop || !canUsePressureDiffs(*SU)) {
-    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+  if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
+                         DownwardTracker, UpwardTracker, DAG, SRI);
   } else {
     // Reserve 4 slots.
     Pressure.resize(4, 0);
@@ -206,7 +234,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
 
 #ifdef EXPENSIVE_CHECKS
     std::vector<unsigned> CheckPressure, CheckMaxPressure;
-    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
+                         TheTracker, UpwardTracker, DAG, SRI);
     if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
             CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
         Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -294,8 +323,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   unsigned SGPRPressure = 0;
   unsigned VGPRPressure = 0;
   if (DAG->isTrackingPressure()) {
-    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
-    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    if (!GCNTrackers) {
+      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    } else {
+      GCNRPTracker *T = IsBottomUp
+                            ? static_cast<GCNRPTracker *>(&UpwardTracker)
+                            : static_cast<GCNRPTracker *>(&DownwardTracker);
+      SGPRPressure = T->getPressure().getSGPRNum();
+      VGPRPressure = T->getPressure().getArchVGPRNum();
+    }
   }
   ReadyQueue &Q = Zone.Available;
   for (SUnit *SU : Q) {
@@ -444,6 +481,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
+void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  if (GCNTrackers) {
+    MachineInstr *MI = SU->getInstr();
+    IsTopNode ? (void)DownwardTracker.advance(MI, false)
+              : UpwardTracker.recede(*MI);
+  }
+
+  return GenericScheduler::schedNode(SU, IsTopNode);
+}
+
 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
   assert(CurrentStage && CurrentStage != SchedStages.end());
   return *CurrentStage;
@@ -470,12 +517,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
 }
 
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C)
+    const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  GCNTrackers = GCNTrackers & !IsLegacyScheduler;
 }
 
 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -571,7 +619,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
       MFI(*MF.getInfo<SIMachineFunctionInfo>()),
-      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
+      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
+      RegionLiveOuts(this, /*IsLiveOut=*/true) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
   if (RelaxedOcc) {
@@ -613,6 +662,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   return RPTracker.moveMaxPressure();
 }
 
+static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
+                                        MachineBasicBlock::iterator RegionEnd) {
+  auto REnd = RegionEnd == RegionBegin->getParent()->end()
+                  ? std::prev(RegionEnd)
+                  : RegionEnd;
+  return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+}
+
 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                 const MachineBasicBlock *MBB) {
   GCNDownwardRPTracker RPTracker(*LIS);
@@ -687,20 +744,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
 }
 
 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
-GCNScheduleDAGMILive::getBBLiveInMap() const {
+GCNScheduleDAGMILive::getRegionLiveInMap() const {
   assert(!Regions.empty());
-  std::vector<MachineInstr *> BBStarters;
-  BBStarters.reserve(Regions.size());
+  std::vector<MachineInstr *> RegionFirstMIs;
+  RegionFirstMIs.reserve(Regions.size());
   auto I = Regions.rbegin(), E = Regions.rend();
   auto *BB = I->first->getParent();
   do {
     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
-    BBStarters.push_back(MI);
+    RegionFirstMIs.push_back(MI);
     do {
       ++I;
     } while (I != E && I->first->getParent() == BB);
   } while (I != E);
-  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+  return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
+}
+
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getRegionLiveOutMap() const {
+  assert(!Regions.empty());
+  std::vector<MachineInstr *> RegionLastMIs;
+  RegionLastMIs.reserve(Regions.size());
+  for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+    RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
+
+  return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
+}
+
+void RegionPressureMap::buildLiveRegMap() {
+  IdxToInstruction.clear();
+
+  RegionLiveRegMap =
+      IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
+  for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+    MachineInstr *RegionKey =
+        IsLiveOut
+            ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
+            : &*DAG->Regions[I].first;
+    IdxToInstruction[I] = RegionKey;
+  }
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
@@ -726,8 +808,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
-  if (!Regions.empty())
-    BBLiveInMap = getBBLiveInMap();
+  if (!Regions.empty()) {
+    BBLiveInMap = getRegionLiveInMap();
+    if (GCNTrackers)
+      RegionLiveOuts.buildLiveRegMap();
+  }
 
   GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
   while (S.advanceStage()) {
@@ -745,6 +830,19 @@ void GCNScheduleDAGMILive::runSchedStages() {
         continue;
       }
 
+      if (GCNTrackers) {
+        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
+        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
+        GCNRPTracker::LiveRegSet *RegionLiveIns =
+            &LiveIns[Stage->getRegionIdx()];
+
+        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
+            ->reset(MRI, *RegionLiveIns);
+        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
+            ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
+                             Stage->getRegionIdx()));
+      }
+
       ScheduleDAGMILive::schedule();
       Stage->finalizeGCNRegion();
     }
@@ -1015,6 +1113,7 @@ void GCNSchedStage::finalizeGCNRegion() {
 void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
+
   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
@@ -1586,6 +1685,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DAG.Regions = NewRegions;
   DAG.RescheduleRegions = NewRescheduleRegions;
 
+  if (GCNTrackers)
+    DAG.RegionLiveOuts.buildLiveRegMap();
+
   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f0aea2bc4ab865..64d517038f90e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler {
   // Pointer to the current SchedStageID.
   SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
 
+  // GCN RP Tracker for top-down scheduling
+  mutable GCNDownwardRPTracker DownwardTracker;
+
+  // GCN RP Tracker for bottom-up scheduling
+  mutable GCNUpwardRPTracker UpwardTracker;
+
 public:
   // schedule() have seen register pressure over the critical limits and had to
   // track register pressure for actual scheduling heuristics.
@@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler {
 
   SUnit *pickNode(bool &IsTopNode) override;
 
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+
   void initialize(ScheduleDAGMI *DAG) override;
 
   unsigned getTargetOccupancy() { return TargetOccupancy; }
@@ -116,13 +124,18 @@ class GCNSchedStrategy : public GenericScheduler {
   bool hasNextStage() const;
 
   GCNSchedStageID getNextStage() const;
+
+  GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
+
+  GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
 };
 
 /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
 /// maximum number of waves per simd).
 class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
 public:
-  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
+                               bool IsLegacyScheduler = false);
 };
 
 /// The goal of this scheduling strategy is to maximize ILP for a single wave
@@ -163,6 +176,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
   return OS;
 }
 
+class GCNScheduleDAGMILive;
+class RegionPressureMap {
+  GCNScheduleDAGMILive *DAG;
+  // The live-in or live-out registers as indexed by the first or last MI in
+  // the region before scheduling.
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> RegionLiveRegMap;
+  // The mapping of RegionIDx to key instruction
+  DenseMap<unsigned, MachineInstr *> IdxToInstruction;
+  // Whether we are calculating LiveOuts or LiveIns
+  bool IsLiveOut;
+
+public:
+  RegionPressureMap() {}
+  RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
+      : DAG(GCNDAG), IsLiveOut(LiveOut) {}
+  // Build the Instr->LiveReg and RegionIdx->Instr maps
+  void buildLiveRegMap();
+
+  // Retrieve the LiveReg for a given RegionIdx
+  GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
+    assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end());
+    MachineInstr *Key = IdxToInstruction[RegionIdx];
+    return RegionLiveRegMap[Key];
+  }
+};
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -170,6 +209,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
   friend class ILPInitialScheduleStage;
+  friend class RegionPressureMap;
 
   const GCNSubtarget &ST;
 
@@ -211,9 +251,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Temporary basic block live-in cache.
   DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
 
+  // The map of the initial first region instruction to region live in registers
   DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
 
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+  // Calculate the map of the initial first region instruction to region live in
+  // registers
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;
+
+  // Calculate the map of the initial last region instruction to region live out
+  // registers
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+  getRegionLiveOutMap() const;
+
+  // The live out registers per region. These are internally stored as a map of
+  // the initial last region instruction to region live out registers, but can
+  // be retrieved with the RegionIdx by calls to getLiveRegsForRegionIdx.
+  RegionPressureMap RegionLiveOuts;
 
   // Return current region pressure.
   GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
@@ -311,6 +364,9 @@ class GCNSchedStage {
     return DAG.RegionsWithExcessRP[RegionIdx];
   }
 
+  // The region number this stage is currently working on
+  unsigned getRegionIdx() { return RegionIdx; }
+
   // Returns true if the new schedule may result in more spilling.
   bool mayCauseSpilling(unsigned WavesAfter);
 
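And a short sketch of how the RegionLiveOuts map above gets consumed, mirroring runSchedStages() in GCNSchedStrategy.cpp (RegionIdx, MRI and S are assumed to be the current region index, the MachineRegisterInfo and the GCNSchedStrategy):

  // Built before the scheduling stages run (and rebuilt after the
  // rematerialization stage sinks instructions).
  RegionLiveOuts.buildLiveRegMap();

  // Before scheduling region RegionIdx, seed the bottom-up tracker with
  // that region's live-outs; the top-down tracker is seeded with the
  // region's live-ins in the same way.
  GCNRPTracker::LiveRegSet &LiveOuts =
      RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx);
  S.getUpwardTracker()->reset(MRI, LiveOuts);
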
diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
index e9005e94ce5db7..d57450baea911a 100644
--- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
+++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
@@ -1,11 +1,17 @@
 # REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 --- |
   define amdgpu_kernel void @high-RP-reschedule() { ret void }
 ...
 
-# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4.
+# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule
+
# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage.
 
 ---
 name: high-RP-reschedule
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 4be102f7860eab..f496a4b06bb237 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 # Check that %3 was not rematerialized before the last store since its operand %1
 # is killed by that store.
@@ -7,6 +8,9 @@
 # GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
 # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
 
+# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64
+# GCN-GCNTRACKER-NOT: SI_SPILL
+
 ---
 name:            global_sextload_v32i32_to_v32i64
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
new file mode 100644
index 00000000000000..79187f51af0d2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
@@ -0,0 +1,65 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1  2>&1  < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1  < %s | FileCheck -check-prefixes=GCN %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <7 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+                     <16 x i32>, <5 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+                     <16 x i32>, <6 x i32>, ; vgprs
+                     i64 ; vcc
+                     }
+
+; ERR-GCNTRACKERS: ran out of registers during register allocation
+; GCN-NOT: ran out of registers during register allocation
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+  %alloca1 = alloca i32, align 4, addrspace(5)
+  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+  %asm = call %asm.output asm sideeffect
+    "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+    "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+  %s0 = extractvalue %asm.output %asm, 0
+  %s1 = extractvalue %asm.output %asm, 1
+  %s2 = extractvalue %asm.output %asm, 2
+  %s3 = extractvalue %asm.output %asm, 3
+  %s4 = extractvalue %asm.output %asm, 4
+  %s5 = extractvalue %asm.output %asm, 5
+
+  %v0 = extractvalue %asm.output %asm, 6
+  %v1 = extractvalue %asm.output %asm, 7
+
+  %vcc = extractvalue %asm.output %asm, 8
+
+  ; scc is unavailable since it is live in
+  call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+                           "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+    <16 x i32> %s0,
+    <16 x i32> %s1,
+    <16 x i32> %s2,
+    <8 x i32> %s3,
+    <2 x i32> %s4,
+    i32 %s5,
+    <16 x i32> %v0,
+    <7 x i32> %v1,
+    i64 %vcc,
+    ptr addrspace(5) %alloca1,
+    i32 0) ; use of scc
+
+  ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
new file mode 100644
index 00000000000000..c490c76f4531de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0  < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1  < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s
+
+; CHECK-LABEL: {{^}}spill:
+; GCN:    codeLenInByte = 1000
+; GCN-GCNTRACKERS:    codeLenInByte = 1016
+; GCN:    NumSgprs: 104
+; GCN-GCNTRACKERS:    NumSgprs: 104
+; GCN:    NumVgprs: 1
+; GCN-GCNTRACKERS:    NumVgprs: 2
+; GCN:    ScratchSize: 0
+; GCN-GCNTRACKERS:    ScratchSize: 0
+; GCN:    Occupancy: 5
+; GCN-GCNTRACKERS:    Occupancy: 5
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
+entry:
+  %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+  %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+  %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+  %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+  %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+  %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+  %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+  %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+  %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+  %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+  %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+  %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+  %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+  %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+  %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+  %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+  %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+  %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+  %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+  %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+  %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+  %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+  %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+  %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+  %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+  %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+  %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+  %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+  %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+  %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+  %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+  %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+  %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+  %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+  %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+  %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+  %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+  %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+  %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+  %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+  %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+  %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+  %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+  %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+  %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+  %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+  %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+  %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+  %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+  %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+  %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+  %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+  %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+  %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+  %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+  %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+  %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+  %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+  %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+  %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+  %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+  %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+  %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+  %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+  %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+  %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+  %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+  %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+  %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+  %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+  %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+  %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+  %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+  %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+  %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+  %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+  %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+  %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+  %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+  %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+  %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+  %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+  %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+  %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+  %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+  %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+  %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+  %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+  %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+  %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+  %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+  %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+  %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+  %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+  %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+  %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+  %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+  %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+  %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+  %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+  %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+  %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+  %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+  %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+  ; 64 byte asm
+  call void asm sideeffect
+   "v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64",""() #0
+  br label %bb3
+
+bb3:
+  tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+  tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+  tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+  tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+  tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+  tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+  tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+  tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+  tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+  tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+  tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+  tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+  tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+  tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+  tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+  tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+  tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+  tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+  tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+  tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+  tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+  tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+  tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+  tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+  tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+  tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+  tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+  tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+  tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+  tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+  tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+  tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+  tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+  tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+  tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+  tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+  tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+  tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+  tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+  tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+  tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+  tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+  tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+  tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+  tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+  tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+  tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+  tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+  tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+  tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+  tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+  tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+  tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+  tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+  tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+  tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+  tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+  tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+  tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+  tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+  tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+  tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+  tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+  tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+  tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+  tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+  tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+  tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+  tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+  tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+  tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+  tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+  tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+  tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+  tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+  tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+  tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+  tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+  tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+  tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+  tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+  tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+  tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+  tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+  tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+  tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+  tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+  tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+  tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+  tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+  tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+  tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+  tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+  tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+  tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+  tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+  tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+  tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+  tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+  tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+  tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+  tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}spill_func:
+; GCN:    codeLenInByte = 1612
+; GCN-GCNTRACKERS:    codeLenInByte = 1660
+; GCN:    NumSgprs: 104
+; GCN-GCNTRACKERS:    NumSgprs: 104
+; GCN:    NumVgprs: 3
+; GCN-GCNTRACKERS:    NumVgprs: 4
+; GCN:    ScratchSize: 12
+; GCN-GCNTRACKERS:    ScratchSize: 16
+
+define void @spill_func(ptr addrspace(1) %arg) #0 {
+entry:
+  %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+  %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+  %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+  %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+  %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+  %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+  %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+  %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+  %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+  %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+  %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+  %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+  %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+  %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+  %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+  %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+  %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+  %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+  %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+  %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+  %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+  %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+  %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+  %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+  %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+  %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+  %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+  %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+  %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+  %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+  %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+  %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+  %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+  %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+  %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+  %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+  %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+  %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+  %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+  %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+  %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+  %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+  %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+  %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+  %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+  %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+  %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+  %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+  %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+  %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+  %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+  %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+  %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+  %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+  %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+  %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+  %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+  %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+  %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+  %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+  %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+  %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+  %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+  %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+  %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+  %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+  %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+  %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+  %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+  %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+  %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+  %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+  %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+  %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+  %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+  %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+  %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+  %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+  %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+  %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+  %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+  %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+  %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+  %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+  %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+  %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+  %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+  %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+  %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+  %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+  %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+  %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+  %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+  %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+  %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+  %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+  %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+  %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+  %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+  %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+  %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+  %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+  %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+  %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+  ; 64 byte asm
+  call void asm sideeffect
+   "v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64",""() #0
+  br label %bb3
+
+bb3:
+  tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+  tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+  tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+  tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+  tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+  tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+  tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+  tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+  tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+  tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+  tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+  tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+  tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+  tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+  tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+  tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+  tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+  tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+  tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+  tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+  tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+  tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+  tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+  tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+  tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+  tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+  tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+  tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+  tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+  tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+  tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+  tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+  tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+  tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+  tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+  tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+  tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+  tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+  tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+  tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+  tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+  tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+  tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+  tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+  tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+  tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+  tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+  tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+  tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+  tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+  tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+  tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+  tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+  tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+  tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+  tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+  tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+  tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+  tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+  tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+  tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+  tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+  tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+  tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+  tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+  tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+  tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+  tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+  tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+  tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+  tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+  tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+  tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+  tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+  tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+  tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+  tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+  tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+  tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+  tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+  tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+  tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+  tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+  tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+  tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+  tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+  tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+  tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+  tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+  tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+  tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+  tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+  tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+  tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+  tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+  tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+  tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+  tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+  tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+  tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+  tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+  tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+  tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
new file mode 100644
index 00000000000000..53f533ebb28427
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -0,0 +1,647 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s
+
+; The GCN trackers are sensitive to minor changes in RP, and may avoid scheduling certain instructions which, if scheduled,
+; would have enabled scheduling other instructions that reduce RP.
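+; For example, in constant_zextload_v64i16_to_v64i32 below the GCN trackers reduce NumSgprs from 71 to 54
+; and raise occupancy from 7 to 8, while in return_72xi32 they leave the register counts unchanged but
+; increase codeLenInByte and ScratchSize.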
+
+; CHECK-LABEL: {{^}}return_72xi32:
+; GFX11-PAL:    codeLenInByte = 768
+; GFX11-PAL-GCNTRACKERS:    codeLenInByte = 888
+; GFX11-PAL:    NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
+; GFX11-PAL:    NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
+; GFX11-PAL:    ScratchSize: 220
+; GFX11-PAL-GCNTRACKERS:    ScratchSize: 248
+
+
+; CHECK-LABEL: {{^}}call_72xi32:
+; GFX11-PAL:    codeLenInByte = 1300
+; GFX11-PAL-GCNTRACKERS:    codeLenInByte = 1372
+; GFX11-PAL:    NumSgprs: 35
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 35
+; GFX11-PAL:    NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
+; GFX11-PAL:    ScratchSize: 2780
+; GFX11-PAL-GCNTRACKERS:    ScratchSize: 2808
+
+
+define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
+  ret <72 x i32> %val
+}
+
+define amdgpu_gfx void @call_72xi32() #1 {
+entry:
+  %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer)
+  %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0
+  %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58
+  %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64:
+; TONGA:     codeLenInByte = 420
+; TONGA-GCNTRACKERS:     codeLenInByte = 436
+; TONGA:    NumSgprs: 96
+; TONGA-GCNTRACKERS:    NumSgprs: 96
+; TONGA:    NumVgprs: 33
+; TONGA-GCNTRACKERS:    NumVgprs: 25
+; TONGA:    Occupancy: 7
+; TONGA-GCNTRACKERS:    Occupancy: 8
+
+
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in)  {
+  %val = load <16 x half>, ptr addrspace(1) %in
+  %cvt = fpext <16 x half> %val to <16 x double>
+  store <16 x double> %cvt, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GENERIC:     codeLenInByte = 860
+; GENERIC-GCNTRACKERS:     codeLenInByte = 860
+; GENERIC:    NumSgprs: 71
+; GENERIC-GCNTRACKERS:    NumSgprs: 54
+; GENERIC:    NumVgprs: 16
+; GENERIC-GCNTRACKERS:    NumVgprs: 16
+; GENERIC:    Occupancy: 7
+; GENERIC-GCNTRACKERS:    Occupancy: 8
+
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+  %load = load <64 x i16>, ptr addrspace(4) %in
+  %ext = zext <64 x i16> %load to <64 x i32>
+  store <64 x i32> %ext, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
+; GFX908:     codeLenInByte = 1436
+; GFX908-GCNTRACKERS:     codeLenInByte = 1436
+; GFX908:    NumSgprs: 56
+; GFX908-GCNTRACKERS:    NumSgprs: 56
+; GFX908:    NumVgprs: 43
+; GFX908-GCNTRACKERS:    NumVgprs: 39
+; GFX908:    Occupancy: 5
+; GFX908-GCNTRACKERS:    Occupancy: 6
+
+
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
+entry:
+  %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i2 = load i64, ptr addrspace(4) %i, align 8
+  %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i4 = shl i32 %i3, 8
+  %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
+  %i6 = add i32 %i4, %i5
+  %i7 = trunc i64 %i2 to i32
+  %conv = add i32 %i6, %i7
+  %conv.frozen = freeze i32 %conv
+  %div = udiv i32 %conv.frozen, 49
+  %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef
+  %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5
+  br label %for.cond28.preheader
+
+for.cond28.preheader:                             ; preds = %for.cond28.preheader, %entry
+  %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ]
+  %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ]
+  %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ]
+  %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ]
+  %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ]
+  %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ]
+  %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ]
+  %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ]
+  %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ]
+  %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ]
+  %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ]
+  %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ]
+  %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ]
+  %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ]
+  %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ]
+  %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ]
+  %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ]
+  %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ]
+  %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ]
+  %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ]
+  %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ]
+  %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ]
+  %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ]
+  %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ]
+  %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ]
+  %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ]
+  %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ]
+  %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ]
+  %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ]
+  %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
+  %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
+  %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
+  %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
+  %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
+  %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
+  %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4
+  %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49
+  %i9 = load float, ptr addrspace(1) %add.ptr47, align 4
+  %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98
+  %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4
+  %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147
+  %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4
+  %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4
+  %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024
+  %i13 = load float, ptr addrspace(4) %add.ptr66, align 4
+  %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048
+  %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4
+  %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072
+  %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4
+  %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1
+  %i16 = load float, ptr addrspace(4) %add.ptr70, align 4
+  %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025
+  %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4
+  %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049
+  %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4
+  %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073
+  %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4
+  %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2
+  %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4
+  %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026
+  %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4
+  %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050
+  %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4
+  %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074
+  %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4
+  %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3
+  %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4
+  %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027
+  %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4
+  %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051
+  %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4
+  %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075
+  %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4
+  %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4
+  %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4
+  %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028
+  %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4
+  %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052
+  %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4
+  %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076
+  %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4
+  %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5
+  %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4
+  %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029
+  %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4
+  %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053
+  %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4
+  %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077
+  %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4
+  %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6
+  %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4
+  %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030
+  %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4
+  %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054
+  %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4
+  %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078
+  %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4
+  %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7
+  %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4
+  %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031
+  %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4
+  %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055
+  %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4
+  %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079
+  %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4
+  %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8
+  %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4
+  %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032
+  %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4
+  %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056
+  %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4
+  %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080
+  %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4
+  %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9
+  %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4
+  %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033
+  %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4
+  %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057
+  %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4
+  %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081
+  %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4
+  %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10
+  %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4
+  %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034
+  %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4
+  %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058
+  %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4
+  %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082
+  %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4
+  %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11
+  %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4
+  %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035
+  %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4
+  %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059
+  %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4
+  %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083
+  %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4
+  %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12
+  %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4
+  %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036
+  %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4
+  %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060
+  %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4
+  %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084
+  %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4
+  %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13
+  %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4
+  %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037
+  %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4
+  %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061
+  %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4
+  %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085
+  %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4
+  %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14
+  %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4
+  %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038
+  %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4
+  %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062
+  %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4
+  %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086
+  %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4
+  %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15
+  %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4
+  %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039
+  %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4
+  %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063
+  %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4
+  %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087
+  %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4
+  %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16
+  %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4
+  %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040
+  %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4
+  %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064
+  %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4
+  %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088
+  %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4
+  %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17
+  %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4
+  %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041
+  %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4
+  %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065
+  %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4
+  %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089
+  %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4
+  %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18
+  %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4
+  %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042
+  %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4
+  %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066
+  %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4
+  %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090
+  %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4
+  %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19
+  %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4
+  %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043
+  %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4
+  %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067
+  %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4
+  %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091
+  %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4
+  %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20
+  %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4
+  %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044
+  %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4
+  %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068
+  %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4
+  %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092
+  %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4
+  %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21
+  %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4
+  %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045
+  %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4
+  %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069
+  %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4
+  %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093
+  %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4
+  %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22
+  %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4
+  %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046
+  %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4
+  %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070
+  %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4
+  %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094
+  %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4
+  %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23
+  %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4
+  %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047
+  %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4
+  %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071
+  %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4
+  %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095
+  %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4
+  %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24
+  %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4
+  %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048
+  %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4
+  %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072
+  %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4
+  %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096
+  %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4
+  %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25
+  %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4
+  %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049
+  %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4
+  %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073
+  %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4
+  %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097
+  %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4
+  %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26
+  %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4
+  %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050
+  %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4
+  %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074
+  %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4
+  %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098
+  %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4
+  %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27
+  %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4
+  %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051
+  %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4
+  %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075
+  %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4
+  %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099
+  %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4
+  %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28
+  %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4
+  %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052
+  %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4
+  %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076
+  %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4
+  %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100
+  %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4
+  %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29
+  %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4
+  %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053
+  %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4
+  %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077
+  %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4
+  %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101
+  %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4
+  %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30
+  %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4
+  %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054
+  %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4
+  %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078
+  %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4
+  %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102
+  %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4
+  %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31
+  %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4
+  %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055
+  %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4
+  %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079
+  %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4
+  %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103
+  %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4
+  %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196
+  %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0)
+  %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140)
+  %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141)
+  %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142)
+  %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0)
+  %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144)
+  %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145)
+  %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146)
+  %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0)
+  %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148)
+  %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149)
+  %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150)
+  %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0)
+  %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152)
+  %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153)
+  %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154)
+  %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0)
+  %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156)
+  %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157)
+  %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158)
+  %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0)
+  %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160)
+  %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161)
+  %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162)
+  %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0)
+  %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164)
+  %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165)
+  %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166)
+  %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0)
+  %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168)
+  %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169)
+  %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170)
+  %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0)
+  %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172)
+  %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173)
+  %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174)
+  %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0)
+  %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176)
+  %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177)
+  %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178)
+  %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0)
+  %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180)
+  %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181)
+  %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182)
+  %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0)
+  %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184)
+  %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185)
+  %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186)
+  %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0)
+  %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188)
+  %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189)
+  %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190)
+  %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0)
+  %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192)
+  %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193)
+  %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194)
+  %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0)
+  %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196)
+  %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197)
+  %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198)
+  %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0)
+  %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200)
+  %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201)
+  %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202)
+  %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0)
+  %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204)
+  %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205)
+  %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206)
+  %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0)
+  %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208)
+  %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209)
+  %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210)
+  %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0)
+  %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212)
+  %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213)
+  %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214)
+  %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0)
+  %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216)
+  %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217)
+  %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218)
+  %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0)
+  %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220)
+  %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221)
+  %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222)
+  %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0)
+  %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224)
+  %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225)
+  %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226)
+  %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0)
+  %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228)
+  %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229)
+  %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230)
+  %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0)
+  %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232)
+  %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233)
+  %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234)
+  %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0)
+  %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236)
+  %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237)
+  %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238)
+  %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0)
+  %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240)
+  %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241)
+  %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242)
+  %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0)
+  %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244)
+  %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245)
+  %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246)
+  %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0)
+  %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248)
+  %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249)
+  %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250)
+  %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0)
+  %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252)
+  %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253)
+  %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254)
+  %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0)
+  %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256)
+  %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257)
+  %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258)
+  %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0)
+  %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260)
+  %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261)
+  %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262)
+  %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0)
+  %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264)
+  %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265)
+  %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266)
+  %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096
+  %inc116 = add nuw nsw i32 %ci.0286, 1
+  %exitcond.not = icmp eq i32 %inc116, 512
+  br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader
+
+for.cond.cleanup26:                               ; preds = %for.cond28.preheader
+  %mul119 = shl nuw nsw i32 undef, 1
+  %mul120 = mul i32 %div, 200704
+  %mul121 = mul i32 undef, 6272
+  %add122 = add i32 %mul120, %mul121
+  %mul123 = mul nuw nsw i32 undef, 28
+  %add124 = add i32 %add122, %mul123
+  %add126 = add i32 %add124, %mul119
+  %idx.ext127 = zext i32 %add126 to i64
+  %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127
+  store float %i143, ptr addrspace(1) %add.ptr128, align 4
+  %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196
+  store float %i147, ptr addrspace(1) %add.ptr184, align 4
+  %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4
+  %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4
+  %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196
+  store float %i151, ptr addrspace(1) %add.ptr184.1, align 4
+  %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196
+  store float %i155, ptr addrspace(1) %add.ptr184.2, align 4
+  %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196
+  store float %i159, ptr addrspace(1) %add.ptr184.3, align 4
+  %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196
+  store float %i163, ptr addrspace(1) %add.ptr184.4, align 4
+  %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4
+  %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196
+  store float %i167, ptr addrspace(1) %add.ptr184.5, align 4
+  %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4
+  %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196
+  store float %i171, ptr addrspace(1) %add.ptr184.6, align 4
+  %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196
+  store float %i175, ptr addrspace(1) %add.ptr184.7, align 4
+  %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4
+  %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4
+  %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196
+  store float %i179, ptr addrspace(1) %add.ptr184.8, align 4
+  %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196
+  store float %i183, ptr addrspace(1) %add.ptr184.9, align 4
+  %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196
+  store float %i187, ptr addrspace(1) %add.ptr184.10, align 4
+  %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196
+  store float %i191, ptr addrspace(1) %add.ptr184.11, align 4
+  %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196
+  store float %i195, ptr addrspace(1) %add.ptr184.12, align 4
+  %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196
+  store float %i199, ptr addrspace(1) %add.ptr184.13, align 4
+  %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196
+  store float %i203, ptr addrspace(1) %add.ptr184.14, align 4
+  %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196
+  store float %i207, ptr addrspace(1) %add.ptr184.15, align 4
+  %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196
+  store float %i211, ptr addrspace(1) %add.ptr184.16, align 4
+  %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196
+  store float %i215, ptr addrspace(1) %add.ptr184.17, align 4
+  %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196
+  store float %i219, ptr addrspace(1) %add.ptr184.18, align 4
+  %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196
+  store float %i223, ptr addrspace(1) %add.ptr184.19, align 4
+  %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196
+  store float %i227, ptr addrspace(1) %add.ptr184.20, align 4
+  %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196
+  store float %i231, ptr addrspace(1) %add.ptr184.21, align 4
+  %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196
+  store float %i235, ptr addrspace(1) %add.ptr184.22, align 4
+  %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196
+  store float %i239, ptr addrspace(1) %add.ptr184.23, align 4
+  %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196
+  store float %i243, ptr addrspace(1) %add.ptr184.24, align 4
+  %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196
+  store float %i247, ptr addrspace(1) %add.ptr184.25, align 4
+  %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196
+  store float %i251, ptr addrspace(1) %add.ptr184.26, align 4
+  %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196
+  store float %i255, ptr addrspace(1) %add.ptr184.27, align 4
+  %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196
+  store float %i259, ptr addrspace(1) %add.ptr184.28, align 4
+  %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196
+  store float %i263, ptr addrspace(1) %add.ptr184.29, align 4
+  %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196
+  store float %i267, ptr addrspace(1) %add.ptr184.30, align 4
+  ret void
+}
+
+
+
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workgroup.id.x() #3
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+!0 = !{i32 1, i32 2, i32 1, i32 0}
+!1 = !{!"none", !"none", !"none", !"none"}
+!2 = !{!"ptr", !"ptr", !"ptr", !"float"}
+!3 = !{!"restrict const", !"restrict const", !"restrict", !""}
+!4 = !{i32 256, i32 1, i32 1}
+!5 = !{i32 0, i32 1024}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="64" }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readnone speculatable willreturn }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 14bb4310c619ea..34d203e0de2ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
 
 --- |
   define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
@@ -11,6 +12,20 @@
 # GCN-LABEL: name: no_sched_metric_due_to_spills
 # GCN-NOT: SI_SPILL_
 # GCN: S_ENDPGM
+
+# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: S_ENDPGM
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is
+# still high, and RA is unable to allocate without spills. Running the high-RP-reschedule stage would have further decreased RP, providing increased
+# flexibility for RA.
+
 ---
 name:            no_sched_metric_due_to_spills
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 94815558bf3d6d..71f8d91874f04f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -1,16 +1,24 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs  < %s | FileCheck --check-prefix=OCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs  < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true  < %s | FileCheck --check-prefix=RELAX %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true  < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
 
 
 ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target
 
-; GCN-LABEL: {{^}}load_fma_store:
+; CHECK-LABEL: {{^}}load_fma_store:
 ; OCC:    NumVgprs: 32
+; OCC-GCNTRACKER:    NumVgprs: 24
 ; RELAX:    NumVgprs: 64
+; RELAX-GCNTRACKER:    NumVgprs: 60
 ; OCC: NumVGPRsForWavesPerEU: 32
+; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24
 ; RELAX: NumVGPRsForWavesPerEU: 64
+; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60
 ; OCC:    Occupancy: 8
+; OCC-GCNTRACKER:    Occupancy: 8
 ; RELAX: Occupancy: 4
+; RELAX-GCNTRACKER: Occupancy: 4
 
 define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 {
 bb:


