[llvm] [AMDGPU] Optionally Use GCNRPTrackers during scheduling (PR #93090)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 9 09:43:52 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/93090
>From 276193f872ceb2e2fa82e7a34b7c4a53ec42e55c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 12:55:07 -0700
Subject: [PATCH] [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling
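
Add a hidden flag, -amdgpu-use-amdgpu-trackers (off by default), that has
GCNSchedStrategy drive its heuristics with the GCN-specific
GCNUpwardRPTracker/GCNDownwardRPTracker instead of the generic
RegPressureTracker. The trackers are kept in sync in schedNode() and are
reset per region from precomputed live-in/live-out register sets; the
legacy iterative scheduler keeps the generic path. For example:

  llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 ...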
---
.../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 199 +++++-
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 96 ++-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 148 +++-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 60 +-
.../CodeGen/AMDGPU/high-RP-reschedule.mir | 10 +-
llvm/test/CodeGen/AMDGPU/pr51516.mir | 6 +-
.../schedule-amdgpu-tracker-physreg-crash.ll | 65 ++
.../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++
.../AMDGPU/schedule-amdgpu-trackers.ll | 647 ++++++++++++++++++
...schedule-regpressure-ilp-metric-spills.mir | 17 +-
.../AMDGPU/schedule-relaxed-occupancy.ll | 10 +-
12 files changed, 1672 insertions(+), 79 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 13504508e2fb2e..da065e8d8cb6b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = "
<< TgtOcc << '\n');
- GCNMaxOccupancySchedStrategy LStrgy(Context);
+ GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true);
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7c633b2bce7bc2..d46c4cf23a221e 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -296,6 +296,63 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
}
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+static LaneBitmask getLanesWithProperty(
+ const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+ LaneBitmask SafeDefault,
+ function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
+ if (RegUnit.isVirtual()) {
+ const LiveInterval &LI = LIS.getInterval(RegUnit);
+ LaneBitmask Result;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
+ }
+ } else if (Property(LI, Pos)) {
+ Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
+ : LaneBitmask::getAll();
+ }
+
+ return Result;
+ }
+
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+}
+
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+/// Helper to find a vreg use between two indices {PriorUseIdx, NextUseIdx}.
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI,
+ const LiveIntervals *LIS,
+ bool Upward = false) {
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ const MachineInstr *MI = MO.getParent();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
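+    // Upward queries use the half-open range (PriorUseIdx, NextUseIdx];
+    // downward queries use [PriorUseIdx, NextUseIdx).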
+ bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
+ : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
+ if (!InRange)
+ continue;
+
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ LastUseMask &= ~UseMask;
+ if (LastUseMask.none())
+ return LaneBitmask::getNone();
+ }
+ return LastUseMask;
+}
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -354,17 +411,28 @@ void GCNRPTracker::reset(const MachineInstr &MI,
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
-////////////////////////////////////////////////////////////////////////////////
-// GCNUpwardRPTracker
-
-void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_,
- const LiveRegSet &LiveRegs_) {
+void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
+ const LiveRegSet &LiveRegs_) {
MRI = &MRI_;
LiveRegs = LiveRegs_;
LastTrackedMI = nullptr;
MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
+ SlotIndex Pos) const {
+  return getLanesWithProperty(
+      LIS, *MRI, /*TrackLaneMasks=*/true, RegUnit, Pos.getBaseIndex(),
+      LaneBitmask::getNone(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->end == Pos.getRegSlot();
+ });
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
@@ -441,25 +509,37 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
return true;
}
-bool GCNDownwardRPTracker::advanceBeforeNext() {
+bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
+ bool UseInternalIterator) {
assert(MRI && "call reset first");
- if (!LastTrackedMI)
- return NextMI == MBBEnd;
-
- assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+ SlotIndex SI;
+ const MachineInstr *CurrMI;
+ if (UseInternalIterator) {
+ if (!LastTrackedMI)
+ return NextMI == MBBEnd;
+
+ assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+ CurrMI = LastTrackedMI;
+
+ SI = NextMI == MBBEnd
+ ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
+ : LIS.getInstructionIndex(*NextMI).getBaseIndex();
+  } else { // !UseInternalIterator
+ SI = LIS.getInstructionIndex(*MI).getBaseIndex();
+ CurrMI = MI;
+ }
- SlotIndex SI = NextMI == MBBEnd
- ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
- : LIS.getInstructionIndex(*NextMI).getBaseIndex();
assert(SI.isValid());
// Remove dead registers or mask bits.
SmallSet<Register, 8> SeenRegs;
- for (auto &MO : LastTrackedMI->operands()) {
+ for (auto &MO : CurrMI->operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
if (MO.isUse() && !MO.readsReg())
continue;
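+    // With an externally managed iterator, skip the defs of MI here; any
+    // dead def lanes are removed by the extra advanceBeforeNext call in
+    // advance().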
+ if (!UseInternalIterator && MO.isDef())
+ continue;
if (!SeenRegs.insert(MO.getReg()).second)
continue;
const LiveInterval &LI = LIS.getInterval(MO.getReg());
@@ -492,15 +572,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
LastTrackedMI = nullptr;
- return NextMI == MBBEnd;
+ return UseInternalIterator && (NextMI == MBBEnd);
}
-void GCNDownwardRPTracker::advanceToNext() {
- LastTrackedMI = &*NextMI++;
- NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
+ bool UseInternalIterator) {
+ if (UseInternalIterator) {
+ LastTrackedMI = &*NextMI++;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ } else {
+ LastTrackedMI = MI;
+ }
+
+ const MachineInstr *CurrMI = LastTrackedMI;
// Add new registers or mask bits.
- for (const auto &MO : LastTrackedMI->all_defs()) {
+ for (const auto &MO : CurrMI->all_defs()) {
Register Reg = MO.getReg();
if (!Reg.isVirtual())
continue;
@@ -513,11 +600,16 @@ void GCNDownwardRPTracker::advanceToNext() {
MaxPressure = max(MaxPressure, CurPressure);
}
-bool GCNDownwardRPTracker::advance() {
- if (NextMI == MBBEnd)
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) {
+ if (UseInternalIterator && NextMI == MBBEnd)
return false;
- advanceBeforeNext();
- advanceToNext();
+
+ advanceBeforeNext(MI, UseInternalIterator);
+ advanceToNext(MI, UseInternalIterator);
+ if (!UseInternalIterator) {
+ // We must remove any dead def lanes from the current RP
+ advanceBeforeNext(MI, true);
+ }
return true;
}
@@ -559,6 +651,67 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
});
}
+GCNRegPressure
+GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const {
+ assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+  SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+  RegOpers.collect(*MI, *TRI, *MRI, /*TrackLaneMasks=*/true,
+                   /*IgnoreDead=*/false);
+ RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+ GCNRegPressure TempPressure = CurPressure;
+
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ Register Reg = Use.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask.none())
+ continue;
+ // The LastUseMask is queried from the liveness information of instruction
+ // which may be further down the schedule. Some lanes may actually not be
+ // last uses for the current position.
+ // FIXME: allow the caller to pass in the list of vreg uses that remain
+ // to be bottom-scheduled to avoid searching uses at each query.
+ SlotIndex CurrIdx;
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(
+ LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end());
+ if (IdxPos == MBB->end()) {
+ CurrIdx = LIS.getMBBEndIdx(MBB);
+ } else {
+ CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot();
+ }
+
+ LastUseMask =
+ findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS);
+ if (LastUseMask.none())
+ continue;
+
+ LaneBitmask LiveMask =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
+ LaneBitmask NewMask = LiveMask & ~LastUseMask;
+ TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ Register Reg = Def.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LiveMask =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
+ LaneBitmask NewMask = LiveMask | Def.LaneMask;
+ TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ }
+
+ return TempPressure;
+}
+
bool GCNUpwardRPTracker::isValid() const {
const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index d419fcc802c60a..06c3d9027db1b5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -19,6 +19,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
namespace llvm {
@@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1,
return Diff;
}
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
class GCNRPTracker {
public:
using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
@@ -165,7 +169,14 @@ class GCNRPTracker {
void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
bool After);
+ /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+ void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
+
+ LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+
public:
+  // Reset tracker and set live register set to the specified value.
+ void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
@@ -182,34 +193,38 @@ class GCNRPTracker {
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
class GCNUpwardRPTracker : public GCNRPTracker {
public:
GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
- // reset tracker and set live register set to the specified value.
- void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+ using GCNRPTracker::reset;
- // reset tracker at the specified slot index.
+  /// Reset tracker at the specified slot index \p SI.
void reset(const MachineRegisterInfo &MRI, SlotIndex SI) {
- reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
+ GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
}
- // reset tracker to the end of the MBB.
+  /// Reset tracker to the end of \p MBB.
void reset(const MachineBasicBlock &MBB) {
reset(MBB.getParent()->getRegInfo(),
LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
}
- // reset tracker to the point just after MI (in program order).
+  /// Reset tracker to the point just after \p MI (in program order).
void reset(const MachineInstr &MI) {
reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot());
}
- // move to the state just before the MI (in program order).
+  /// Move to the state of RP just before \p MI (in program order).
void recede(const MachineInstr &MI);
- // checks whether the tracker's state after receding MI corresponds
- // to reported by LIS.
+  /// \returns whether the tracker's state after receding MI corresponds to
+  /// the state reported by LIS.
bool isValid() const;
const GCNRegPressure &getMaxPressure() const { return MaxPressure; }
@@ -223,6 +238,9 @@ class GCNUpwardRPTracker : public GCNRPTracker {
}
};
+////////////////////////////////////////////////////////////////////////////////
+// GCNDownwardRPTracker
+
class GCNDownwardRPTracker : public GCNRPTracker {
// Last position of reset or advanceBeforeNext
MachineBasicBlock::const_iterator NextMI;
@@ -232,37 +250,65 @@ class GCNDownwardRPTracker : public GCNRPTracker {
public:
GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+ using GCNRPTracker::reset;
+
MachineBasicBlock::const_iterator getNext() const { return NextMI; }
- // Return MaxPressure and clear it.
+  /// \returns MaxPressure and clears it.
GCNRegPressure moveMaxPressure() {
auto Res = MaxPressure;
MaxPressure.clear();
return Res;
}
- // Reset tracker to the point before the MI
- // filling live regs upon this point using LIS.
- // Returns false if block is empty except debug values.
+  /// Reset tracker to the point before \p MI, filling \p LiveRegs at this
+  /// point using LIS.
+  /// \returns false if the block is empty except for debug values.
bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
- // Move to the state right before the next MI or after the end of MBB.
- // Returns false if reached end of the block.
- bool advanceBeforeNext();
-
- // Move to the state at the MI, advanceBeforeNext has to be called first.
- void advanceToNext();
-
- // Move to the state at the next MI. Returns false if reached end of block.
- bool advance();
-
- // Advance instructions until before End.
+ /// Move to the state right before the next MI or after the end of MBB.
+  /// \returns false if the end of the block was reached.
+ /// If \p UseInternalIterator is true, then internal iterators are used and
+ /// set to process in program order. If \p UseInternalIterator is false, then
+ /// it is assumed that the tracker is using an externally managed iterator,
+ /// and advance* calls will not update the state of the iterator. In such
+ /// cases, the tracker will move to the state right before the provided \p MI
+ /// and use LIS for RP calculations.
+ bool advanceBeforeNext(MachineInstr *MI = nullptr,
+ bool UseInternalIterator = true);
+
+  /// Move to the state at \p MI; advanceBeforeNext() has to be called first.
+ /// If \p UseInternalIterator is true, then internal iterators are used and
+ /// set to process in program order. If \p UseInternalIterator is false, then
+ /// it is assumed that the tracker is using an externally managed iterator,
+ /// and advance* calls will not update the state of the iterator. In such
+  /// cases, the tracker will move to the state at the provided \p MI.
+ void advanceToNext(MachineInstr *MI = nullptr,
+ bool UseInternalIterator = true);
+
+  /// Move to the state at the next MI. \returns false if the end of the
+  /// block was reached. If \p UseInternalIterator is true, then internal
+  /// iterators are used
+ /// and set to process in program order. If \p UseInternalIterator is false,
+ /// then it is assumed that the tracker is using an externally managed
+ /// iterator, and advance* calls will not update the state of the iterator. In
+ /// such cases, the tracker will move to the state right before the provided
+ /// \p MI and use LIS for RP calculations.
+ bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true);
+
+ /// Advance instructions until before \p End.
bool advance(MachineBasicBlock::const_iterator End);
- // Reset to Begin and advance to End.
+ /// Reset to \p Begin and advance to \p End.
bool advance(MachineBasicBlock::const_iterator Begin,
MachineBasicBlock::const_iterator End,
const LiveRegSet *LiveRegsCopy = nullptr);
+
+ /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+  /// Calculate the impact \p MI will have on CurPressure and \return the
+  /// speculated pressure. To support RP speculation, this does not rely on
+  /// the implicit program ordering in the LiveIntervals.
+ GCNRegPressure bumpDownwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const;
};
/// \returns the LaneMask of live lanes of \p Reg at position \p SI. Only the
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d6958d9055fade..11c95675aeeafa 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -58,11 +58,17 @@ static cl::opt<bool>
"Wave Limited (amdgpu-limit-wave-threshold)."),
cl::init(false));
+static cl::opt<bool> GCNTrackers(
+ "amdgpu-use-amdgpu-trackers", cl::Hidden,
+ cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
+ cl::init(false));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
- HasHighPressure(false) {}
+ DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
+}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -148,17 +154,38 @@ static bool canUsePressureDiffs(const SUnit &SU) {
return true;
}
-static void getRegisterPressures(bool AtTop,
- const RegPressureTracker &RPTracker, SUnit *SU,
- std::vector<unsigned> &Pressure,
- std::vector<unsigned> &MaxPressure) {
+static void getRegisterPressures(
+ bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
+ std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
+ GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
+ ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
- if (AtTop)
- TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else
- TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ if (!GCNTrackers) {
+ AtTop
+ ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
+ : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+
+ return;
+ }
+
+ // GCNTrackers
+ Pressure.resize(4, 0);
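+  // Reserve the four pressure-set slots; only SReg_32, VGPR_32, and AGPR_32
+  // are filled below. Speculate on a copy of the tracker so the query does
+  // not perturb the scheduler's live tracker state.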
+ MachineInstr *MI = SU->getInstr();
+ GCNRegPressure NewPressure;
+ if (AtTop) {
+ GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
+ NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
+ } else {
+ GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
+ TempUpwardTracker.recede(*MI);
+ NewPressure = TempUpwardTracker.getPressure();
+ }
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ NewPressure.getArchVGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -187,8 +214,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
- if (AtTop || !canUsePressureDiffs(*SU)) {
- getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+ if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+ getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
+ DownwardTracker, UpwardTracker, DAG, SRI);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
@@ -206,7 +234,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
- getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+ getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
+                       DownwardTracker, UpwardTracker, DAG, SRI);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -294,8 +323,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
if (DAG->isTrackingPressure()) {
- SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
- VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ if (!GCNTrackers) {
+ SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ } else {
+ GCNRPTracker *T = IsBottomUp
+ ? static_cast<GCNRPTracker *>(&UpwardTracker)
+ : static_cast<GCNRPTracker *>(&DownwardTracker);
+ SGPRPressure = T->getPressure().getSGPRNum();
+ VGPRPressure = T->getPressure().getArchVGPRNum();
+ }
}
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
@@ -444,6 +481,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
return SU;
}
+void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+ if (GCNTrackers) {
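+    // Keep the GCN trackers in sync with the scheduler's picks: top-down
+    // nodes advance the downward tracker in externally managed iterator
+    // mode, while bottom-up nodes recede the upward tracker.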
+ MachineInstr *MI = SU->getInstr();
+    IsTopNode
+        ? (void)DownwardTracker.advance(MI, /*UseInternalIterator=*/false)
+        : UpwardTracker.recede(*MI);
+ }
+
+ return GenericScheduler::schedNode(SU, IsTopNode);
+}
+
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
assert(CurrentStage && CurrentStage != SchedStages.end());
return *CurrentStage;
@@ -470,12 +517,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
}
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
- const MachineSchedContext *C)
+ const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  GCNTrackers = GCNTrackers && !IsLegacyScheduler;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -571,7 +619,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
+ StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
+ RegionLiveOuts(this, /*IsLiveOut=*/true) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
if (RelaxedOcc) {
@@ -613,6 +662,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
return RPTracker.moveMaxPressure();
}
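+// Find the key MI for a region's live-out lookup: step backward over debug
+// instructions from RegionEnd, or from the block's final MI when the region
+// ends at the block end.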
+static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
+ MachineBasicBlock::iterator RegionEnd) {
+ auto REnd = RegionEnd == RegionBegin->getParent()->end()
+ ? std::prev(RegionEnd)
+ : RegionEnd;
+ return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+}
+
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
@@ -687,20 +744,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
-GCNScheduleDAGMILive::getBBLiveInMap() const {
+GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
- std::vector<MachineInstr *> BBStarters;
- BBStarters.reserve(Regions.size());
+ std::vector<MachineInstr *> RegionFirstMIs;
+ RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
auto *BB = I->first->getParent();
do {
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
- BBStarters.push_back(MI);
+ RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == BB);
} while (I != E);
- return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+ return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
+}
+
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getRegionLiveOutMap() const {
+ assert(!Regions.empty());
+ std::vector<MachineInstr *> RegionLastMIs;
+ RegionLastMIs.reserve(Regions.size());
+ for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+ RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
+
+ return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
+}
+
+void RegionPressureMap::buildLiveRegMap() {
+ IdxToInstruction.clear();
+
+ RegionLiveRegMap =
+ IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
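+  // Key each region by its pre-scheduling boundary instruction: the last MI
+  // of the region for live-outs, the first MI for live-ins.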
+ for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+ MachineInstr *RegionKey =
+ IsLiveOut
+ ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
+ : &*DAG->Regions[I].first;
+ IdxToInstruction[I] = RegionKey;
+ }
}
void GCNScheduleDAGMILive::finalizeSchedule() {
@@ -726,8 +808,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- if (!Regions.empty())
- BBLiveInMap = getBBLiveInMap();
+ if (!Regions.empty()) {
+ BBLiveInMap = getRegionLiveInMap();
+ if (GCNTrackers)
+ RegionLiveOuts.buildLiveRegMap();
+ }
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
@@ -745,6 +830,19 @@ void GCNScheduleDAGMILive::runSchedStages() {
continue;
}
+ if (GCNTrackers) {
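+        // Seed the trackers with the region's precomputed live-in (downward)
+        // and live-out (upward) register sets before scheduling the region.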
+ GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
+ GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
+ GCNRPTracker::LiveRegSet *RegionLiveIns =
+ &LiveIns[Stage->getRegionIdx()];
+
+      static_cast<GCNRPTracker *>(DownwardTracker)
+          ->reset(MRI, *RegionLiveIns);
+      static_cast<GCNRPTracker *>(UpwardTracker)
+          ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
+                           Stage->getRegionIdx()));
+ }
+
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
}
@@ -1015,6 +1113,7 @@ void GCNSchedStage::finalizeGCNRegion() {
void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
+
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
@@ -1586,6 +1685,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
+ if (GCNTrackers)
+ DAG.RegionLiveOuts.buildLiveRegMap();
+
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f0aea2bc4ab865..64d517038f90e0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler {
// Pointer to the current SchedStageID.
SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+ // GCN RP Tracker for top-down scheduling
+ mutable GCNDownwardRPTracker DownwardTracker;
+
+  // GCN RP Tracker for bottom-up scheduling
+ mutable GCNUpwardRPTracker UpwardTracker;
+
public:
// schedule() have seen register pressure over the critical limits and had to
// track register pressure for actual scheduling heuristics.
@@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler {
SUnit *pickNode(bool &IsTopNode) override;
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
void initialize(ScheduleDAGMI *DAG) override;
unsigned getTargetOccupancy() { return TargetOccupancy; }
@@ -116,13 +124,18 @@ class GCNSchedStrategy : public GenericScheduler {
bool hasNextStage() const;
GCNSchedStageID getNextStage() const;
+
+ GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
+
+ GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
/// maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
public:
- GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
+ bool IsLegacyScheduler = false);
};
/// The goal of this scheduling strategy is to maximize ILP for a single wave
@@ -163,6 +176,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
return OS;
}
+class GCNScheduleDAGMILive;
+class RegionPressureMap {
+ GCNScheduleDAGMILive *DAG;
+  // The live-in/live-out register sets, indexed by the first or last MI of
+  // the region before scheduling.
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> RegionLiveRegMap;
+  // The mapping of RegionIdx to key instruction
+ DenseMap<unsigned, MachineInstr *> IdxToInstruction;
+ // Whether we are calculating LiveOuts or LiveIns
+ bool IsLiveOut;
+
+public:
+  RegionPressureMap() = default;
+ RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
+ : DAG(GCNDAG), IsLiveOut(LiveOut) {}
+ // Build the Instr->LiveReg and RegionIdx->Instr maps
+ void buildLiveRegMap();
+
+ // Retrieve the LiveReg for a given RegionIdx
+ GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
+ assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end());
+ MachineInstr *Key = IdxToInstruction[RegionIdx];
+ return RegionLiveRegMap[Key];
+ }
+};
+
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
@@ -170,6 +209,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class ClusteredLowOccStage;
friend class PreRARematStage;
friend class ILPInitialScheduleStage;
+ friend class RegionPressureMap;
const GCNSubtarget &ST;
@@ -211,9 +251,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
+  // The map of each region's initial first MI to the region's live-in
+  // registers
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
- DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+  // Calculate the map of each region's initial first MI to the region's
+  // live-in registers
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;
+
+  // Calculate the map of each region's initial last MI to the region's
+  // live-out registers
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+ getRegionLiveOutMap() const;
+
+  // The live-out registers per region. These are internally stored as a map
+  // from each region's initial last MI to its live-out registers, but can be
+  // retrieved by RegionIdx via getLiveRegsForRegionIdx().
+ RegionPressureMap RegionLiveOuts;
// Return current region pressure.
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
@@ -311,6 +364,9 @@ class GCNSchedStage {
return DAG.RegionsWithExcessRP[RegionIdx];
}
+ // The region number this stage is currently working on
+ unsigned getRegionIdx() { return RegionIdx; }
+
// Returns true if the new schedule may result in more spilling.
bool mayCauseSpilling(unsigned WavesAfter);
diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
index e9005e94ce5db7..d57450baea911a 100644
--- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
+++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
@@ -1,11 +1,17 @@
# REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s
--- |
define amdgpu_kernel void @high-RP-reschedule() { ret void }
...
-# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4.
+# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule
+
+# When using the GCN trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage.
---
name: high-RP-reschedule
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 4be102f7860eab..f496a4b06bb237 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
@@ -7,6 +8,9 @@
# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
+# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64
+# GCN-GCNTRACKER-NOT: SI_SPILL
+
---
name: global_sextload_v32i32_to_v32i64
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
new file mode 100644
index 00000000000000..79187f51af0d2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
@@ -0,0 +1,65 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <7 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <5 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+ <16 x i32>, <6 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+; ERR-GCNTRACKERS: ran out of registers during register allocation
+; GCN-NOT: ran out of registers during register allocation
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+ %asm = call %asm.output asm sideeffect
+ "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+ "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+ %s0 = extractvalue %asm.output %asm, 0
+ %s1 = extractvalue %asm.output %asm, 1
+ %s2 = extractvalue %asm.output %asm, 2
+ %s3 = extractvalue %asm.output %asm, 3
+ %s4 = extractvalue %asm.output %asm, 4
+ %s5 = extractvalue %asm.output %asm, 5
+
+ %v0 = extractvalue %asm.output %asm, 6
+ %v1 = extractvalue %asm.output %asm, 7
+
+ %vcc = extractvalue %asm.output %asm, 8
+
+ ; scc is unavailable since it is live in
+ call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+ <16 x i32> %s0,
+ <16 x i32> %s1,
+ <16 x i32> %s2,
+ <8 x i32> %s3,
+ <2 x i32> %s4,
+ i32 %s5,
+ <16 x i32> %v0,
+ <7 x i32> %v1,
+ i64 %vcc,
+ ptr addrspace(5) %alloca1,
+ i32 0) ; use of scc
+
+ ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
new file mode 100644
index 00000000000000..c490c76f4531de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s
+
+; GCN-LABEL: {{^}}spill:
+; GCN-GCNTRACKERS-LABEL: {{^}}spill:
+; GCN: codeLenInByte = 1000
+; GCN-GCNTRACKERS: codeLenInByte = 1016
+; GCN: NumSgprs: 104
+; GCN-GCNTRACKERS: NumSgprs: 104
+; GCN: NumVgprs: 1
+; GCN-GCNTRACKERS: NumVgprs: 2
+; GCN: ScratchSize: 0
+; GCN-GCNTRACKERS: ScratchSize: 0
+; GCN: Occupancy: 5
+; GCN-GCNTRACKERS: Occupancy: 5
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
+entry:
+ %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+ %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+ %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+ %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+ %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+ %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+ %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+ %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+ %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+ %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+ %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+ %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+ %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+ %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+ %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+ %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+ %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+ %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+ %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+ %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+ %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+ %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+ %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+ %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+ %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+ %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+ %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+ %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+ %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+ %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+ %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+ %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+ %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+ %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+ %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+ %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+ %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+ %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+ %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+ %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+ %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+ %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+ %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+ %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+ %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+ %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+ %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+ %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+ %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+ %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+ %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+ %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+ %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+ %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+ %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+ %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+ %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+ %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+ %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+ %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+ %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+ %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+ %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+ %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+ %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+ %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+ %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+ %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+ %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+ %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+ %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+ %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+ %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+ %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+ %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+ %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+ %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+ %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+ %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+ %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+ %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+ %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+ %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+ %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+ %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+ %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+ %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+ %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+ %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+ %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+ %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+ %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+ %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+ %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+ %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+ %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+ %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+ %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+ %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+ %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+ %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+ %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+ %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+ %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+ %cmp = icmp eq i32 %cnd, 0
+ br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+ ; 64 byte asm
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64",""() #0
+ br label %bb3
+
+bb3:
+ tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+ tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+ tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+ tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+ tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+ tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+ tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+ tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+ tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+ tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+ tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+ tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+ tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+ tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+ tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+ tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+ tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+ tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+ tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+ tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+ tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+ tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+ tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+ tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+ tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+ tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+ tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+ tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+ tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+ tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+ tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+ tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+ tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+ tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+ tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+ tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+ tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+ tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+ tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+ tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+ tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+ tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+ tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+ tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+ tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+ tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+ tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+ tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+ tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+ tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+ tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+ tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+ tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+ tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+ tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+ tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+ tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+ tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+ tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+ tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+ tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+ tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+ tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+ tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+ tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+ tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+ tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+ tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+ tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+ tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+ tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+ tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+ tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+ tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+ tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+ tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+ tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+ tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+ tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+ tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+ tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+ tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+ tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+ tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+ tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+ tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+ tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+ tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+ tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+ tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+ tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+ tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+ tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+ tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+ tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+ tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+ tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+ tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+ tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+ tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+ tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+ tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}spill_func:
+; GCN-GCNTRACKERS-LABEL: {{^}}spill_func:
+; GCN: codeLenInByte = 1612
+; GCN-GCNTRACKERS: codeLenInByte = 1660
+; GCN: NumSgprs: 104
+; GCN-GCNTRACKERS: NumSgprs: 104
+; GCN: NumVgprs: 3
+; GCN-GCNTRACKERS: NumVgprs: 4
+; GCN: ScratchSize: 12
+; GCN-GCNTRACKERS: ScratchSize: 16
+
+define void @spill_func(ptr addrspace(1) %arg) #0 {
+entry:
+ %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+ %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+ %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+ %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+ %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+ %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+ %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+ %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+ %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+ %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+ %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+ %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+ %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+ %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+ %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+ %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+ %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+ %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+ %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+ %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+ %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+ %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+ %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+ %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+ %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+ %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+ %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+ %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+ %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+ %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+ %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+ %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+ %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+ %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+ %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+ %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+ %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+ %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+ %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+ %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+ %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+ %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+ %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+ %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+ %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+ %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+ %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+ %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+ %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+ %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+ %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+ %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+ %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+ %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+ %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+ %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+ %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+ %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+ %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+ %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+ %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+ %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+ %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+ %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+ %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+ %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+ %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+ %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+ %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+ %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+ %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+ %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+ %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+ %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+ %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+ %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+ %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+ %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+ %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+ %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+ %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+ %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+ %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+ %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+ %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+ %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+ %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+ %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+ %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+ %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+ %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+ %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+ %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+ %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+ %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+ %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+ %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+ %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+ %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+ %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+ %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+ %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+ %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+ %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+ %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+ %cmp = icmp eq i32 %cnd, 0
+ br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+ ; 64 byte asm
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64",""() #0
+ br label %bb3
+
+bb3:
+ tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+ tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+ tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+ tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+ tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+ tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+ tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+ tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+ tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+ tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+ tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+ tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+ tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+ tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+ tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+ tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+ tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+ tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+ tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+ tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+ tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+ tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+ tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+ tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+ tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+ tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+ tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+ tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+ tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+ tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+ tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+ tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+ tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+ tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+ tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+ tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+ tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+ tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+ tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+ tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+ tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+ tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+ tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+ tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+ tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+ tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+ tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+ tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+ tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+ tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+ tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+ tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+ tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+ tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+ tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+ tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+ tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+ tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+ tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+ tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+ tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+ tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+ tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+ tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+ tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+ tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+ tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+ tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+ tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+ tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+ tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+ tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+ tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+ tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+ tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+ tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+ tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+ tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+ tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+ tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+ tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+ tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+ tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+ tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+ tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+ tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+ tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+ tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+ tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+ tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+ tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+ tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+ tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+ tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+ tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+ tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+ tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+ tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+ tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+ tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+ tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+ tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
new file mode 100644
index 00000000000000..53f533ebb28427
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -0,0 +1,647 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s
+
+; The GCN trackers are sensitive to minor changes in RP, and may avoid scheduling
+; certain instructions that, if scheduled, would allow scheduling of other
+; instructions which reduce RP.
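+;
+; A minimal illustrative sketch (hypothetical; not taken from this patch): with
+; two ready instructions
+;   %x = load i32, ptr %p       ; opens a new live range (RP goes up)
+;   store i32 %y, ptr %q        ; last use of %y, closes a live range (RP goes down)
+; a tracker that charges the load's RP increase up front may defer the load,
+; even when issuing it first would unblock consumers of %x that in turn close
+; other live ranges.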
+
+; CHECK-LABEL: {{^}}return_72xi32:
+; GFX11-PAL: codeLenInByte = 768
+; GFX11-PAL-GCNTRACKERS: codeLenInByte = 888
+; GFX11-PAL: NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS: NumSgprs: 33
+; GFX11-PAL: NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
+; GFX11-PAL: ScratchSize: 220
+; GFX11-PAL-GCNTRACKERS: ScratchSize: 248
+
+
+; CHECK-LABEL: {{^}}call_72xi32:
+; GFX11-PAL: codeLenInByte = 1300
+; GFX11-PAL-GCNTRACKERS: codeLenInByte = 1372
+; GFX11-PAL: NumSgprs: 35
+; GFX11-PAL-GCNTRACKERS: NumSgprs: 35
+; GFX11-PAL: NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
+; GFX11-PAL: ScratchSize: 2780
+; GFX11-PAL-GCNTRACKERS: ScratchSize: 2808
+
+
+define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
+ ret <72 x i32> %val
+}
+
+define amdgpu_gfx void @call_72xi32() #1 {
+entry:
+ %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer)
+ %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0
+ %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58
+ %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64:
+; TONGA: codeLenInByte = 420
+; TONGA-GCNTRACKERS: codeLenInByte = 436
+; TONGA: NumSgprs: 96
+; TONGA-GCNTRACKERS: NumSgprs: 96
+; TONGA: NumVgprs: 33
+; TONGA-GCNTRACKERS: NumVgprs: 25
+; TONGA: Occupancy: 7
+; TONGA-GCNTRACKERS: Occupancy: 8
+
+
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+ %val = load <16 x half>, ptr addrspace(1) %in
+ %cvt = fpext <16 x half> %val to <16 x double>
+ store <16 x double> %cvt, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GENERIC: codeLenInByte = 860
+; GENERIC-GCNTRACKERS: codeLenInByte = 860
+; GENERIC: NumSgprs: 71
+; GENERIC-GCNTRACKERS: NumSgprs: 54
+; GENERIC: NumVgprs: 16
+; GENERIC-GCNTRACKERS: NumVgprs: 16
+; GENERIC: Occupancy: 7
+; GENERIC-GCNTRACKERS: Occupancy: 8
+
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+ %load = load <64 x i16>, ptr addrspace(4) %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
+; GFX908: codeLenInByte = 1436
+; GFX908-GCNTRACKERS: codeLenInByte = 1436
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
+; GFX908: NumVgprs: 43
+; GFX908-GCNTRACKERS: NumVgprs: 39
+; GFX908: Occupancy: 5
+; GFX908-GCNTRACKERS: Occupancy: 6
+
+
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
+entry:
+ %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i2 = load i64, ptr addrspace(4) %i, align 8
+ %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %i4 = shl i32 %i3, 8
+ %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
+ %i6 = add i32 %i4, %i5
+ %i7 = trunc i64 %i2 to i32
+ %conv = add i32 %i6, %i7
+ %conv.frozen = freeze i32 %conv
+ %div = udiv i32 %conv.frozen, 49
+ %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef
+ %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5
+ br label %for.cond28.preheader
+
+for.cond28.preheader: ; preds = %for.cond28.preheader, %entry
+ %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ]
+ %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ]
+ %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ]
+ %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ]
+ %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ]
+ %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ]
+ %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ]
+ %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ]
+ %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ]
+ %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ]
+ %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ]
+ %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ]
+ %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ]
+ %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ]
+ %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ]
+ %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ]
+ %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ]
+ %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ]
+ %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ]
+ %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ]
+ %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ]
+ %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ]
+ %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ]
+ %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ]
+ %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ]
+ %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ]
+ %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ]
+ %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ]
+ %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ]
+ %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
+ %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
+ %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
+ %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
+ %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
+ %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
+ %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4
+ %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49
+ %i9 = load float, ptr addrspace(1) %add.ptr47, align 4
+ %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98
+ %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4
+ %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147
+ %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4
+ %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4
+ %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024
+ %i13 = load float, ptr addrspace(4) %add.ptr66, align 4
+ %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048
+ %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4
+ %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072
+ %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4
+ %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1
+ %i16 = load float, ptr addrspace(4) %add.ptr70, align 4
+ %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025
+ %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4
+ %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049
+ %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4
+ %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073
+ %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4
+ %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2
+ %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4
+ %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026
+ %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4
+ %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050
+ %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4
+ %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074
+ %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4
+ %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3
+ %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4
+ %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027
+ %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4
+ %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051
+ %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4
+ %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075
+ %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4
+ %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4
+ %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4
+ %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028
+ %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4
+ %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052
+ %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4
+ %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076
+ %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4
+ %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5
+ %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4
+ %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029
+ %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4
+ %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053
+ %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4
+ %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077
+ %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4
+ %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6
+ %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4
+ %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030
+ %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4
+ %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054
+ %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4
+ %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078
+ %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4
+ %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7
+ %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4
+ %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031
+ %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4
+ %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055
+ %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4
+ %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079
+ %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4
+ %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8
+ %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4
+ %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032
+ %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4
+ %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056
+ %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4
+ %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080
+ %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4
+ %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9
+ %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4
+ %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033
+ %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4
+ %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057
+ %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4
+ %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081
+ %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4
+ %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10
+ %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4
+ %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034
+ %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4
+ %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058
+ %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4
+ %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082
+ %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4
+ %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11
+ %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4
+ %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035
+ %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4
+ %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059
+ %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4
+ %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083
+ %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4
+ %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12
+ %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4
+ %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036
+ %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4
+ %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060
+ %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4
+ %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084
+ %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4
+ %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13
+ %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4
+ %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037
+ %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4
+ %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061
+ %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4
+ %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085
+ %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4
+ %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14
+ %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4
+ %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038
+ %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4
+ %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062
+ %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4
+ %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086
+ %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4
+ %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15
+ %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4
+ %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039
+ %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4
+ %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063
+ %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4
+ %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087
+ %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4
+ %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16
+ %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4
+ %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040
+ %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4
+ %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064
+ %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4
+ %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088
+ %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4
+ %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17
+ %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4
+ %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041
+ %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4
+ %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065
+ %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4
+ %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089
+ %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4
+ %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18
+ %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4
+ %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042
+ %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4
+ %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066
+ %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4
+ %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090
+ %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4
+ %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19
+ %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4
+ %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043
+ %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4
+ %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067
+ %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4
+ %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091
+ %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4
+ %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20
+ %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4
+ %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044
+ %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4
+ %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068
+ %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4
+ %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092
+ %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4
+ %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21
+ %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4
+ %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045
+ %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4
+ %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069
+ %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4
+ %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093
+ %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4
+ %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22
+ %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4
+ %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046
+ %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4
+ %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070
+ %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4
+ %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094
+ %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4
+ %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23
+ %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4
+ %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047
+ %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4
+ %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071
+ %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4
+ %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095
+ %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4
+ %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24
+ %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4
+ %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048
+ %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4
+ %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072
+ %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4
+ %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096
+ %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4
+ %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25
+ %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4
+ %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049
+ %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4
+ %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073
+ %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4
+ %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097
+ %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4
+ %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26
+ %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4
+ %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050
+ %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4
+ %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074
+ %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4
+ %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098
+ %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4
+ %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27
+ %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4
+ %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051
+ %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4
+ %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075
+ %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4
+ %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099
+ %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4
+ %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28
+ %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4
+ %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052
+ %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4
+ %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076
+ %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4
+ %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100
+ %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4
+ %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29
+ %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4
+ %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053
+ %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4
+ %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077
+ %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4
+ %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101
+ %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4
+ %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30
+ %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4
+ %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054
+ %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4
+ %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078
+ %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4
+ %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102
+ %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4
+ %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31
+ %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4
+ %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055
+ %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4
+ %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079
+ %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4
+ %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103
+ %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4
+ %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196
+ %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0)
+ %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140)
+ %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141)
+ %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142)
+ %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0)
+ %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144)
+ %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145)
+ %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146)
+ %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0)
+ %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148)
+ %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149)
+ %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150)
+ %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0)
+ %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152)
+ %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153)
+ %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154)
+ %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0)
+ %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156)
+ %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157)
+ %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158)
+ %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0)
+ %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160)
+ %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161)
+ %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162)
+ %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0)
+ %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164)
+ %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165)
+ %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166)
+ %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0)
+ %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168)
+ %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169)
+ %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170)
+ %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0)
+ %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172)
+ %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173)
+ %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174)
+ %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0)
+ %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176)
+ %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177)
+ %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178)
+ %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0)
+ %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180)
+ %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181)
+ %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182)
+ %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0)
+ %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184)
+ %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185)
+ %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186)
+ %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0)
+ %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188)
+ %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189)
+ %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190)
+ %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0)
+ %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192)
+ %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193)
+ %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194)
+ %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0)
+ %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196)
+ %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197)
+ %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198)
+ %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0)
+ %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200)
+ %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201)
+ %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202)
+ %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0)
+ %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204)
+ %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205)
+ %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206)
+ %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0)
+ %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208)
+ %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209)
+ %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210)
+ %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0)
+ %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212)
+ %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213)
+ %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214)
+ %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0)
+ %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216)
+ %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217)
+ %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218)
+ %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0)
+ %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220)
+ %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221)
+ %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222)
+ %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0)
+ %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224)
+ %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225)
+ %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226)
+ %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0)
+ %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228)
+ %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229)
+ %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230)
+ %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0)
+ %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232)
+ %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233)
+ %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234)
+ %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0)
+ %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236)
+ %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237)
+ %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238)
+ %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0)
+ %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240)
+ %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241)
+ %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242)
+ %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0)
+ %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244)
+ %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245)
+ %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246)
+ %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0)
+ %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248)
+ %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249)
+ %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250)
+ %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0)
+ %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252)
+ %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253)
+ %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254)
+ %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0)
+ %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256)
+ %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257)
+ %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258)
+ %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0)
+ %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260)
+ %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261)
+ %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262)
+ %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0)
+ %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264)
+ %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265)
+ %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266)
+ %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096
+ %inc116 = add nuw nsw i32 %ci.0286, 1
+ %exitcond.not = icmp eq i32 %inc116, 512
+ br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader
+
+for.cond.cleanup26: ; preds = %for.cond28.preheader
+ %mul119 = shl nuw nsw i32 undef, 1
+ %mul120 = mul i32 %div, 200704
+ %mul121 = mul i32 undef, 6272
+ %add122 = add i32 %mul120, %mul121
+ %mul123 = mul nuw nsw i32 undef, 28
+ %add124 = add i32 %add122, %mul123
+ %add126 = add i32 %add124, %mul119
+ %idx.ext127 = zext i32 %add126 to i64
+ %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127
+ store float %i143, ptr addrspace(1) %add.ptr128, align 4
+ %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196
+ store float %i147, ptr addrspace(1) %add.ptr184, align 4
+ %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4
+ %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4
+ %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196
+ store float %i151, ptr addrspace(1) %add.ptr184.1, align 4
+ %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196
+ store float %i155, ptr addrspace(1) %add.ptr184.2, align 4
+ %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196
+ store float %i159, ptr addrspace(1) %add.ptr184.3, align 4
+ %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196
+ store float %i163, ptr addrspace(1) %add.ptr184.4, align 4
+ %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4
+ %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196
+ store float %i167, ptr addrspace(1) %add.ptr184.5, align 4
+ %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4
+ %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196
+ store float %i171, ptr addrspace(1) %add.ptr184.6, align 4
+ %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196
+ store float %i175, ptr addrspace(1) %add.ptr184.7, align 4
+ %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4
+ %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4
+ %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196
+ store float %i179, ptr addrspace(1) %add.ptr184.8, align 4
+ %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196
+ store float %i183, ptr addrspace(1) %add.ptr184.9, align 4
+ %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196
+ store float %i187, ptr addrspace(1) %add.ptr184.10, align 4
+ %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196
+ store float %i191, ptr addrspace(1) %add.ptr184.11, align 4
+ %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196
+ store float %i195, ptr addrspace(1) %add.ptr184.12, align 4
+ %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196
+ store float %i199, ptr addrspace(1) %add.ptr184.13, align 4
+ %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196
+ store float %i203, ptr addrspace(1) %add.ptr184.14, align 4
+ %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196
+ store float %i207, ptr addrspace(1) %add.ptr184.15, align 4
+ %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196
+ store float %i211, ptr addrspace(1) %add.ptr184.16, align 4
+ %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196
+ store float %i215, ptr addrspace(1) %add.ptr184.17, align 4
+ %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196
+ store float %i219, ptr addrspace(1) %add.ptr184.18, align 4
+ %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196
+ store float %i223, ptr addrspace(1) %add.ptr184.19, align 4
+ %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196
+ store float %i227, ptr addrspace(1) %add.ptr184.20, align 4
+ %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196
+ store float %i231, ptr addrspace(1) %add.ptr184.21, align 4
+ %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196
+ store float %i235, ptr addrspace(1) %add.ptr184.22, align 4
+ %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196
+ store float %i239, ptr addrspace(1) %add.ptr184.23, align 4
+ %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196
+ store float %i243, ptr addrspace(1) %add.ptr184.24, align 4
+ %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196
+ store float %i247, ptr addrspace(1) %add.ptr184.25, align 4
+ %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196
+ store float %i251, ptr addrspace(1) %add.ptr184.26, align 4
+ %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196
+ store float %i255, ptr addrspace(1) %add.ptr184.27, align 4
+ %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196
+ store float %i259, ptr addrspace(1) %add.ptr184.28, align 4
+ %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196
+ store float %i263, ptr addrspace(1) %add.ptr184.29, align 4
+ %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196
+ store float %i267, ptr addrspace(1) %add.ptr184.30, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workgroup.id.x() #3
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+!0 = !{i32 1, i32 2, i32 1, i32 0}
+!1 = !{!"none", !"none", !"none", !"none"}
+!2 = !{!"ptr", !"ptr", !"ptr", !"float"}
+!3 = !{!"restrict const", !"restrict const", !"restrict", !""}
+!4 = !{i32 256, i32 1, i32 1}
+!5 = !{i32 0, i32 1024}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="64" }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readnone speculatable willreturn }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 14bb4310c619ea..34d203e0de2ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
--- |
define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
@@ -11,6 +12,20 @@
# GCN-LABEL: name: no_sched_metric_due_to_spills
# GCN-NOT: SI_SPILL_
# GCN: S_ENDPGM
+
+# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: S_ENDPGM
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is
+# still high, and RA is unable to allocate without spills. Running the high-RP-reschedule stage would have further decreased RP, providing increased
+# flexibility for RA.
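+#
+# A minimal sketch of the stage gating described above (illustrative names,
+# not the verbatim GCNSchedStrategy.cpp logic): the reschedule stage is only
+# entered when the initial schedule missed its occupancy target, so a region
+# that meets the target but still sits near the register limit is left alone,
+# and RA may be forced to spill:
+#
+#   // Hypothetical gate: rerun scheduling only for regions that missed the
+#   // occupancy target; high-but-allocatable pressure does not trigger it.
+#   bool shouldRunHighRPReschedule(unsigned AchievedOcc, unsigned TargetOcc) {
+#     return AchievedOcc < TargetOcc;
+#   }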
+
---
name: no_sched_metric_due_to_spills
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 94815558bf3d6d..71f8d91874f04f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -1,16 +1,24 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
; Using -amdgpu-schedule-relaxed-occupancy allows the scheduler to produce better ILP by further relaxing the occupancy target.
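+; On gfx906 each SIMD has 256 VGPRs, so the VGPR counts checked below map
+; directly onto the reported occupancy (a back-of-the-envelope check that
+; ignores other limits such as SGPRs, LDS, and the wave cap):
+;   256 / 32 VGPRs = 8 waves for OCC, while RELAX tolerates
+;   256 / 64 VGPRs = 4 waves in exchange for better ILP.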
-; GCN-LABEL: {{^}}load_fma_store:
+; CHECK-LABEL: {{^}}load_fma_store:
; OCC: NumVgprs: 32
+; OCC-GCNTRACKER: NumVgprs: 24
; RELAX: NumVgprs: 64
+; RELAX-GCNTRACKER: NumVgprs: 60
; OCC: NumVGPRsForWavesPerEU: 32
+; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24
; RELAX: NumVGPRsForWavesPerEU: 64
+; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60
; OCC: Occupancy: 8
+; OCC-GCNTRACKER: Occupancy: 8
; RELAX: Occupancy: 4
+; RELAX-GCNTRACKER: Occupancy: 4
define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 {
bb: