[llvm] [AMDGPU] Optionally Use GCNRPTrackers during scheduling (PR #93090)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 6 15:45:09 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/93090
>From 434b5983e22da95a4a1648316c576f4af16ae02f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 12:55:07 -0700
Subject: [PATCH 01/24] [AMDGPU] NFC: Add BBLiveOutMap & LiveOut Cache
Change-Id: I63cfd44e635cc4bee0e6780ca43b692c46e940b7
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 58 ++++++++++++++++++---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 42 ++++++++++++++-
2 files changed, 91 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d6958d9055fade..0a1a72c230db85 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool>
"Wave Limited (amdgpu-limit-wave-threshold)."),
cl::init(false));
+static cl::opt<bool> GCNTrackers(
+ "amdgpu-use-amdgpu-trackers", cl::Hidden,
+ cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
+ cl::init(false));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -571,7 +576,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
- StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
+ StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
+ RegionLiveOuts(this, /*IsLiveOut=*/true) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
if (RelaxedOcc) {
@@ -613,6 +619,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
return RPTracker.moveMaxPressure();
}
+static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
+ MachineBasicBlock::iterator RegionEnd) {
+ auto REnd = RegionEnd == RegionBegin->getParent()->end()
+ ? std::prev(RegionEnd)
+ : RegionEnd;
+ return &*skipDebugInstructionsBackward(REnd, RegionBegin);
+}
+
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
@@ -687,20 +701,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
-GCNScheduleDAGMILive::getBBLiveInMap() const {
+GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
- std::vector<MachineInstr *> BBStarters;
- BBStarters.reserve(Regions.size());
+ std::vector<MachineInstr *> RegionFirstMIs;
+ RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
auto *BB = I->first->getParent();
do {
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
- BBStarters.push_back(MI);
+ RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == BB);
} while (I != E);
- return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+ return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
+}
+
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getRegionLiveOutMap() const {
+ assert(!Regions.empty());
+ std::vector<MachineInstr *> RegionLastMIs;
+ RegionLastMIs.reserve(Regions.size());
+ for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+ RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
+
+ return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
+}
+
+void RegionPressureMap::buildLiveRegMap() {
+ IdxToInstruction.clear();
+
+ BBLiveRegMap =
+ IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
+ for (unsigned I = 0; I < DAG->Regions.size(); I++) {
+ MachineInstr *RegionKey =
+ IsLiveOut
+ ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
+ : &*DAG->Regions[I].first;
+ IdxToInstruction[I] = RegionKey;
+ }
}
void GCNScheduleDAGMILive::finalizeSchedule() {
@@ -726,8 +765,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- if (!Regions.empty())
- BBLiveInMap = getBBLiveInMap();
+ if (!Regions.empty()) {
+ BBLiveInMap = getRegionLiveInMap();
+ if (GCNTrackers)
+ RegionLiveOuts.buildLiveRegMap();
+ }
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f0aea2bc4ab865..c402fb1ef373c9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -163,6 +163,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
return OS;
}
+class GCNScheduleDAGMILive;
+class RegionPressureMap {
+ GCNScheduleDAGMILive *DAG;
+ // The live in/out register set as indexed by the first or last MI in the
+ // region before scheduling.
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveRegMap;
+ // The mapping of RegionIdx to key instruction
+ DenseMap<unsigned, MachineInstr *> IdxToInstruction;
+ // Whether we are calculating LiveOuts or LiveIns
+ bool IsLiveOut;
+
+public:
+ RegionPressureMap() {}
+ RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
+ : DAG(GCNDAG), IsLiveOut(LiveOut) {}
+ // Build the Instr->LiveReg and RegionIdx->Instr maps
+ void buildLiveRegMap();
+
+ // Retrieve the LiveReg for a given RegionIdx
+ GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
+ assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end());
+ MachineInstr *Key = IdxToInstruction[RegionIdx];
+ return BBLiveRegMap[Key];
+ }
+};
+
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
@@ -170,6 +196,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class ClusteredLowOccStage;
friend class PreRARematStage;
friend class ILPInitialScheduleStage;
+ friend class RegionPressureMap;
const GCNSubtarget &ST;
@@ -211,9 +238,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
+ // The map of the initial first region instruction to region live in registers
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
- DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+ // Calculate the map of the initial first region instruction to region live in
+ // registers
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;
+
+ // Calculate the map of the initial last region instruction to region live out
+ // registers
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+ getRegionLiveOutMap() const;
+
+ // The live out registers per region. These are internally stored as a map of
+ // the initial last region instruction to region live out registers, but can
+ // be retrieved with the regionIdx by calls to getLiveRegsForRegionIdx.
+ RegionPressureMap RegionLiveOuts;
// Return current region pressure.
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
>From 6a57763122b140007aadc27ec9108762f5de350f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 13:34:59 -0700
Subject: [PATCH 02/24] [AMDGPU] NFC: Provide RPTracker interface for external
iterators
Change-Id: I79b54722e6e858961486248d94766c3f3c161160
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 284 ++++++++++++++++++++--
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 95 ++++++--
2 files changed, 330 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index cb0624f11592d2..d1a50adc1918cf 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -289,6 +289,72 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
}
}
+static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
+ Register RegUnit) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end())
+ return LaneBitmask::getNone();
+ return I->LaneMask;
+}
+
+static LaneBitmask
+getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+ LaneBitmask SafeDefault,
+ bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
+ if (RegUnit.isVirtual()) {
+ const LiveInterval &LI = LIS.getInterval(RegUnit);
+ LaneBitmask Result;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
+ }
+ } else if (Property(LI, Pos)) {
+ Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
+ : LaneBitmask::getAll();
+ }
+
+ return Result;
+ } else {
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+ // Be prepared for missing liveranges: We usually do not compute liveranges
+ // for physical registers on targets with many registers (GPUs).
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+ }
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS,
+ bool Upward = false) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ const MachineInstr *MI = MO.getParent();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+ bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
+ : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
+ if (InRange) {
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ LastUseMask &= ~UseMask;
+ if (LastUseMask.none())
+ return LaneBitmask::getNone();
+ }
+ }
+ return LastUseMask;
+}
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -344,17 +410,47 @@ void GCNRPTracker::reset(const MachineInstr &MI,
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
-////////////////////////////////////////////////////////////////////////////////
-// GCNUpwardRPTracker
-
-void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_,
- const LiveRegSet &LiveRegs_) {
+void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
+ const LiveRegSet &LiveRegs_) {
MRI = &MRI_;
LiveRegs = LiveRegs_;
LastTrackedMI = nullptr;
MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
}
+void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ Register Reg = P.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
+ }
+ MaxPressure = max(MaxPressure, CurPressure);
+ for (const RegisterMaskPair &P : DeadDefs) {
+ Register Reg = P.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI);
+ }
+}
+
+LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
+ SlotIndex Pos) const {
+ return getLanesWithProperty(
+ LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->end == Pos.getRegSlot();
+ });
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
@@ -415,6 +511,63 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
+void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
+ assert(RegOpers.DeadDefs.empty());
+ RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+ RegOpers.detectDeadDefs(*MI, LIS);
+
+ // Boost max pressure for all dead defs together. bumpDeadDefs temporarily
+ // raises CurPressure for the dead defs, records the result in MaxPressure,
+ // and then restores CurPressure, so only MaxPressure is affected.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs) {
+ Register Reg = P.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LiveAfter = LiveRegs[Reg];
+ LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+ LaneBitmask DefLanes = P.LaneMask;
+ LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
+
+ // There may be parts of the register that were dead before the
+ // instruction, but became live afterwards. Similarly, some parts
+ // may have been killed in this instruction.
+ CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
+ CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI);
+ MaxPressure = max(MaxPressure, CurPressure);
+ }
+ // Generate liveness for uses.
+ for (const RegisterMaskPair &P : RegOpers.Uses) {
+ Register Reg = P.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ // If this register was also in a def operand, we've handled it
+ // with defs.
+ if (getRegLanes(RegOpers.Defs, Reg).any())
+ continue;
+ LaneBitmask LiveAfter = LiveRegs[Reg];
+ SlotIndex CurrIdx =
+ LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
+ : LIS.getMBBEndIdx(MI->getParent());
+ LaneBitmask LastUseMask =
+ findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true);
+ LastUseMask &= ~LiveAfter;
+ LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
+ CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
+ }
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
////////////////////////////////////////////////////////////////////////////////
// GCNDownwardRPTracker
@@ -431,28 +584,44 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
return true;
}
-bool GCNDownwardRPTracker::advanceBeforeNext() {
+bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
+ bool UseInternalIterator,
+ LiveIntervals *TheLIS) {
assert(MRI && "call reset first");
- if (!LastTrackedMI)
- return NextMI == MBBEnd;
-
- assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+ SlotIndex SI;
+ LiveIntervals *CurrLIS;
+ MachineInstr *CurrMI;
+ if (UseInternalIterator) {
+ if (!LastTrackedMI)
+ return NextMI == MBBEnd;
+
+ assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
+ CurrLIS = const_cast<LiveIntervals *>(&LIS);
+ CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+
+ SI = NextMI == MBBEnd
+ ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
+ : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex();
+ } else { // !UseInternalIterator
+ CurrLIS = TheLIS;
+ SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex();
+ CurrMI = MI;
+ }
- SlotIndex SI = NextMI == MBBEnd
- ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
- : LIS.getInstructionIndex(*NextMI).getBaseIndex();
assert(SI.isValid());
// Remove dead registers or mask bits.
SmallSet<Register, 8> SeenRegs;
- for (auto &MO : LastTrackedMI->operands()) {
+ for (auto &MO : CurrMI->operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
if (MO.isUse() && !MO.readsReg())
continue;
+ if (!UseInternalIterator && MO.isDef())
+ continue;
if (!SeenRegs.insert(MO.getReg()).second)
continue;
- const LiveInterval &LI = LIS.getInterval(MO.getReg());
+ const LiveInterval &LI = CurrLIS->getInterval(MO.getReg());
if (LI.hasSubRanges()) {
auto It = LiveRegs.end();
for (const auto &S : LI.subranges()) {
@@ -482,15 +651,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
LastTrackedMI = nullptr;
- return NextMI == MBBEnd;
+ return UseInternalIterator && (NextMI == MBBEnd);
}
-void GCNDownwardRPTracker::advanceToNext() {
- LastTrackedMI = &*NextMI++;
- NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
+ bool UseInternalIterator) {
+ if (UseInternalIterator) {
+ LastTrackedMI = &*NextMI++;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ } else {
+ LastTrackedMI = MI;
+ }
+
+ MachineInstr *CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
// Add new registers or mask bits.
- for (const auto &MO : LastTrackedMI->all_defs()) {
+ for (const auto &MO : CurrMI->all_defs()) {
Register Reg = MO.getReg();
if (!Reg.isVirtual())
continue;
@@ -503,11 +679,12 @@ void GCNDownwardRPTracker::advanceToNext() {
MaxPressure = max(MaxPressure, CurPressure);
}
-bool GCNDownwardRPTracker::advance() {
- if (NextMI == MBBEnd)
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
+ LiveIntervals *TheLIS) {
+ if (UseInternalIterator && NextMI == MBBEnd)
return false;
- advanceBeforeNext();
- advanceToNext();
+ advanceBeforeNext(MI, UseInternalIterator, TheLIS);
+ advanceToNext(MI, UseInternalIterator);
return true;
}
@@ -549,6 +726,65 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
});
}
+void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx;
+ SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
+ RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ Register Reg = Use.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask.none())
+ continue;
+ // The LastUseMask is queried from the liveness information of instruction
+ // which may be further down the schedule. Some lanes may actually not be
+ // last uses for the current position.
+ // FIXME: allow the caller to pass in the list of vreg uses that remain
+ // to be bottom-scheduled to avoid searching uses at each query.
+ SlotIndex CurrIdx;
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(
+ LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end());
+ if (IdxPos == MBB->end()) {
+ CurrIdx = LIS.getMBBEndIdx(MBB);
+ } else {
+ CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot();
+ }
+
+ LastUseMask =
+ findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS);
+ if (LastUseMask.none())
+ continue;
+
+ LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask NewMask = LiveMask & ~LastUseMask;
+ CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ Register Reg = Def.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask NewMask = LiveMask | Def.LaneMask;
+ CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ }
+ MaxPressure = max(MaxPressure, CurPressure);
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+}
+
bool GCNUpwardRPTracker::isValid() const {
const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 54dc1972d27619..a79e412ce33449 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -19,6 +19,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
namespace llvm {
@@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1,
return Diff;
}
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
class GCNRPTracker {
public:
using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
@@ -165,7 +169,14 @@ class GCNRPTracker {
void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
bool After);
+ void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
+
+ LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
+
public:
+ // reset tracker and set live register set to the specified value.
+ void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
@@ -182,34 +193,40 @@ class GCNRPTracker {
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
class GCNUpwardRPTracker : public GCNRPTracker {
public:
GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
- // reset tracker and set live register set to the specified value.
- void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+ using GCNRPTracker::reset;
- // reset tracker at the specified slot index.
+ /// reset tracker at the specified slot index \p SI.
void reset(const MachineRegisterInfo &MRI, SlotIndex SI) {
- reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
+ GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
}
- // reset tracker to the end of the MBB.
+ /// reset tracker to the end of the \p MBB.
void reset(const MachineBasicBlock &MBB) {
reset(MBB.getParent()->getRegInfo(),
LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
}
- // reset tracker to the point just after MI (in program order).
+ /// reset tracker to the point just after \p MI (in program order).
void reset(const MachineInstr &MI) {
reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot());
}
- // move to the state just before the MI (in program order).
+ /// Move to the state of RP just before \p MI. If \p UseInternalIterator is
+ /// set, also update the internal iterators. Setting \p UseInternalIterator
+ /// to false allows for an externally managed iterator / program order.
void recede(const MachineInstr &MI);
- // checks whether the tracker's state after receding MI corresponds
- // to reported by LIS.
+ void bumpUpwardPressure(const MachineInstr *MI);
+
+ /// \returns whether the tracker's state after receding MI corresponds
+ /// to that reported by LIS.
bool isValid() const;
const GCNRegPressure &getMaxPressure() const { return MaxPressure; }
@@ -223,6 +240,9 @@ class GCNUpwardRPTracker : public GCNRPTracker {
}
};
+////////////////////////////////////////////////////////////////////////////////
+// GCNDownwardRPTracker
+
class GCNDownwardRPTracker : public GCNRPTracker {
// Last position of reset or advanceBeforeNext
MachineBasicBlock::const_iterator NextMI;
@@ -232,37 +252,62 @@ class GCNDownwardRPTracker : public GCNRPTracker {
public:
GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+ using GCNRPTracker::reset;
+
MachineBasicBlock::const_iterator getNext() const { return NextMI; }
- // Return MaxPressure and clear it.
+ /// \returns MaxPressure and clears it.
GCNRegPressure moveMaxPressure() {
auto Res = MaxPressure;
MaxPressure.clear();
return Res;
}
- // Reset tracker to the point before the MI
- // filling live regs upon this point using LIS.
- // Returns false if block is empty except debug values.
+ /// Reset tracker to the point before the \p MI
+ /// filling \p LiveRegs upon this point using LIS.
+ /// \returns false if block is empty except debug values.
bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
- // Move to the state right before the next MI or after the end of MBB.
- // Returns false if reached end of the block.
- bool advanceBeforeNext();
-
- // Move to the state at the MI, advanceBeforeNext has to be called first.
- void advanceToNext();
-
- // Move to the state at the next MI. Returns false if reached end of block.
- bool advance();
-
- // Advance instructions until before End.
+ /// Move to the state right before the next MI or after the end of MBB.
+ /// \returns false if reached end of the block.
+ /// If \p UseInternalIterator is true, then internal iterators are used and
+ /// set to process in program order. If \p UseInternalIterator is false, then
+ /// it is assumed that the tracker is using an externally managed iterator,
+ /// and advance* calls will not update the state of the iterator. In such
+ /// cases, the tracker will move to the state right before the provided \p MI
+ /// and use the provided \p TheLIS for RP calculations.
+ bool advanceBeforeNext(MachineInstr *MI = nullptr,
+ bool UseInternalIterator = true,
+ LiveIntervals *TheLIS = nullptr);
+
+ /// Move to the state at the MI, advanceBeforeNext has to be called first.
+ /// If \p UseInternalIterator is true, then internal iterators are used and
+ /// set to process in program order. If \p UseInternalIterator is false, then
+ /// it is assumed that the tracker is using an externally managed iterator,
+ /// and advance* calls will not update the state of the iterator. In such
+ /// cases, the tracker will move to the state at the provided \p MI.
+ void advanceToNext(MachineInstr *MI = nullptr,
+ bool UseInternalIterator = true);
+
+ /// Move to the state at the next MI. \returns false if reached end of
+ /// block. If \p UseInternalIterator is true, then internal iterators are used
+ /// and set to process in program order. If \p UseInternalIterator is false,
+ /// then it is assumed that the tracker is using an externally managed
+ /// iterator, and advance* calls will not update the state of the iterator. In
+ /// such cases, the tracker will move to the state right before the provided
+ /// \p MI and use the provided \p TheLIS for RP calculations.
+ bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true,
+ LiveIntervals *TheLIS = nullptr);
+
+ /// Advance instructions until before \p End.
bool advance(MachineBasicBlock::const_iterator End);
- // Reset to Begin and advance to End.
+ /// Reset to \p Begin and advance to \p End.
bool advance(MachineBasicBlock::const_iterator Begin,
MachineBasicBlock::const_iterator End,
const LiveRegSet *LiveRegsCopy = nullptr);
+
+ void bumpDownwardPressure(const MachineInstr *MI);
};
LaneBitmask getLiveLaneMask(unsigned Reg,
>From b625761fea8816058228f63c05b563723390f62c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 21 May 2024 18:04:25 -0700
Subject: [PATCH 03/24] [AMDGPU] Optionally Use AMDGPU RPTrackers during
scheduling
Change-Id: I6ae56149c1eb49ea85362267174cc6274c416330
---
.../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 1 -
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 90 ++++++++++++++++---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 19 +++-
4 files changed, 96 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 13504508e2fb2e..9b1db3241e4327 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = "
<< TgtOcc << '\n');
- GCNMaxOccupancySchedStrategy LStrgy(Context);
+ GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true);
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a79e412ce33449..f78e4d7da0a1dd 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -176,7 +176,6 @@ class GCNRPTracker {
public:
// reset tracker and set live register set to the specified value.
void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
-
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0a1a72c230db85..1e6d95d128709d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -67,6 +67,7 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+ TheTracker(*C->LIS), TheUpwardTracker(*C->LIS),
HasHighPressure(false) {}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
@@ -156,14 +157,37 @@ static bool canUsePressureDiffs(const SUnit &SU) {
static void getRegisterPressures(bool AtTop,
const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure,
- std::vector<unsigned> &MaxPressure) {
+ std::vector<unsigned> &MaxPressure,
+ GCNDownwardRPTracker &TheTracker,
+ GCNUpwardRPTracker &TheUpwardTracker,
+ ScheduleDAGMI *DAG) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
- if (AtTop)
- TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else
- TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ if (!GCNTrackers) {
+ if (AtTop)
+ TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ else
+ TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ } else {
+ if (AtTop) {
+ GCNDownwardRPTracker TempTopTracker(TheTracker);
+ auto MI = SU->getInstr();
+ TempTopTracker.advance(MI, true, DAG->getLIS());
+
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false);
+ }
+
+ else {
+ GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
+ auto MI = SU->getInstr();
+ TempBotTracker.recede(*MI, true);
+
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false);
+ }
+ }
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -192,8 +216,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
- if (AtTop || !canUsePressureDiffs(*SU)) {
- getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+ if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+ getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
@@ -211,7 +235,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
+<<<<<<< HEAD
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+=======
+ getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG);
+>>>>>>> 3fc6929b4a78... [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -299,8 +327,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
if (DAG->isTrackingPressure()) {
- SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
- VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ SGPRPressure =
+ GCNTrackers
+ ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum()
+ : TheUpwardTracker.getPressure().getSGPRNum())
+ : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ VGPRPressure =
+ GCNTrackers
+ ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false)
+ : TheUpwardTracker.getPressure().getVGPRNum(false))
+ : Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
}
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
@@ -449,6 +485,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
return SU;
}
+void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+ if (GCNTrackers) {
+ MachineInstr *MI = SU->getInstr();
+ IsTopNode ? (void)TheTracker.advance(MI, true, DAG->getLIS())
+ : TheUpwardTracker.recede(*MI, true);
+ }
+
+ return GenericScheduler::schedNode(SU, IsTopNode);
+}
+
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
assert(CurrentStage && CurrentStage != SchedStages.end());
return *CurrentStage;
@@ -475,12 +521,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
}
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
- const MachineSchedContext *C)
+ const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+ GCNTrackers = GCNTrackers & !IsLegacyScheduler;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -787,6 +834,20 @@ void GCNScheduleDAGMILive::runSchedStages() {
continue;
}
+ if (GCNTrackers) {
+ GCNDownwardRPTracker *TheTracker = S.getTracker();
+ GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
+ GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+
+ reinterpret_cast<GCNRPTracker *>(TheTracker)->reset(
+ Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ *RegionLiveIns);
+ reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)->reset(
+ Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+
+ }
+
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
}
@@ -1057,6 +1118,7 @@ void GCNSchedStage::finalizeGCNRegion() {
void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
+
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
@@ -1608,9 +1670,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
- // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
- DAG.BBLiveInMap.erase(OldMI);
-
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
@@ -1628,6 +1687,11 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
+ DAG.BBLiveInMap = DAG.getBBLiveInMap();
+
+ if (GCNTrackers)
+ DAG.RegionLiveOuts.buildLiveRegMap();
+
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c402fb1ef373c9..8088339fbd26c2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler {
// Pointer to the current SchedStageID.
SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+ // GCN RP Tracker for top-down scheduling
+ mutable GCNDownwardRPTracker TheTracker;
+
+ // GCN RP Tracker for botttom-up scheduling
+ mutable GCNUpwardRPTracker TheUpwardTracker;
+
public:
// schedule() have seen register pressure over the critical limits and had to
// track register pressure for actual scheduling heuristics.
@@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler {
SUnit *pickNode(bool &IsTopNode) override;
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
void initialize(ScheduleDAGMI *DAG) override;
unsigned getTargetOccupancy() { return TargetOccupancy; }
@@ -116,13 +124,19 @@ class GCNSchedStrategy : public GenericScheduler {
bool hasNextStage() const;
GCNSchedStageID getNextStage() const;
+
+ GCNDownwardRPTracker *getTracker() { return &TheTracker; }
+
+ GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
+
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
/// maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
public:
- GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
+ bool IsLegacyScheduler = false);
};
/// The goal of this scheduling strategy is to maximize ILP for a single wave
@@ -350,6 +364,9 @@ class GCNSchedStage {
bool isRegionWithExcessRP() const {
return DAG.RegionsWithExcessRP[RegionIdx];
}
+
+ // The region number this stage is currently working on
+ unsigned getRegionIdx() { return RegionIdx; }
// Returns true if the new schedule may result in more spilling.
bool mayCauseSpilling(unsigned WavesAfter);
>From 015cf529e7dee05570bee20bf1f069fbf36ec8f0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 14:46:28 -0700
Subject: [PATCH 04/24] Formatting
Change-Id: I1cb0a88e94f4156da6118fcd3724556939351c6d
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 46 +++++++++++----------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3 +-
2 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1e6d95d128709d..a6115afe0f03ce 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,9 +66,8 @@ static cl::opt<bool> GCNTrackers(
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
- : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
- TheTracker(*C->LIS), TheUpwardTracker(*C->LIS),
- HasHighPressure(false) {}
+ : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS),
+ TheUpwardTracker(*C->LIS), HasHighPressure(false) {}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -175,8 +174,10 @@ static void getRegisterPressures(bool AtTop,
auto MI = SU->getInstr();
TempTopTracker.advance(MI, true, DAG->getLIS());
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false);
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+ TempTopTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ TempTopTracker.getPressure().getVGPRNum(false);
}
else {
@@ -184,8 +185,10 @@ static void getRegisterPressures(bool AtTop,
auto MI = SU->getInstr();
TempBotTracker.recede(*MI, true);
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false);
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+ TempBotTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ TempBotTracker.getPressure().getVGPRNum(false);
}
}
}
@@ -217,7 +220,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
- getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG);
+ getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
+ TheTracker, TheUpwardTracker, DAG);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
@@ -235,11 +239,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
-<<<<<<< HEAD
- getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
-=======
- getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG);
->>>>>>> 3fc6929b4a78... [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling
+ getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
+ TheTracker, TheUpwardTracker, DAG);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -837,15 +838,16 @@ void GCNScheduleDAGMILive::runSchedStages() {
if (GCNTrackers) {
GCNDownwardRPTracker *TheTracker = S.getTracker();
GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
- GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
-
- reinterpret_cast<GCNRPTracker *>(TheTracker)->reset(
- Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- *RegionLiveIns);
- reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)->reset(
- Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
-
+ GCNRPTracker::LiveRegSet *RegionLiveIns =
+ &LiveIns[Stage->getRegionIdx()];
+
+ reinterpret_cast<GCNRPTracker *>(TheTracker)
+ ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ *RegionLiveIns);
+ reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)
+ ->reset(
+ Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
}
ScheduleDAGMILive::schedule();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8088339fbd26c2..e8c89b2f1baf27 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -128,7 +128,6 @@ class GCNSchedStrategy : public GenericScheduler {
GCNDownwardRPTracker *getTracker() { return &TheTracker; }
GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
-
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
@@ -364,7 +363,7 @@ class GCNSchedStage {
bool isRegionWithExcessRP() const {
return DAG.RegionsWithExcessRP[RegionIdx];
}
-
+
// The region number this stage is currently working on
unsigned getRegionIdx() { return RegionIdx; }
>From 15d90fb9a27444800fc23e0dc3972b68e784d97e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 27 May 2024 10:43:43 -0700
Subject: [PATCH 05/24] Actually use the iterative trackers
Change-Id: I198925f5ed91b0a49ac265e19fdbe2208139f09a
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a6115afe0f03ce..320acbaf5b22a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -172,7 +172,7 @@ static void getRegisterPressures(bool AtTop,
if (AtTop) {
GCNDownwardRPTracker TempTopTracker(TheTracker);
auto MI = SU->getInstr();
- TempTopTracker.advance(MI, true, DAG->getLIS());
+ TempTopTracker.advance(MI, false, DAG->getLIS());
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempTopTracker.getPressure().getSGPRNum();
@@ -183,7 +183,7 @@ static void getRegisterPressures(bool AtTop,
else {
GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
auto MI = SU->getInstr();
- TempBotTracker.recede(*MI, true);
+ TempBotTracker.recede(*MI, false);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempBotTracker.getPressure().getSGPRNum();
@@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
- IsTopNode ? (void)TheTracker.advance(MI, true, DAG->getLIS())
- : TheUpwardTracker.recede(*MI, true);
+ IsTopNode ? (void)TheTracker.advance(MI, false, DAG->getLIS())
+ : TheUpwardTracker.recede(*MI, false);
}
return GenericScheduler::schedNode(SU, IsTopNode);
>From 9e3362c91e86ab23c7f3a94000a125cfca500032 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:24:09 -0700
Subject: [PATCH 06/24] Review Comments
Change-Id: Ifa69110bf0a239ea14d25c0bad03215d1b018656
---
.../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 51 +++++++++----------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 +--
3 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 9b1db3241e4327..e89016b0ae984e 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = "
<< TgtOcc << '\n');
- GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler*/ true);
+ GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true);
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 320acbaf5b22a6..e4d32b6eefb9b1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,8 +66,8 @@ static cl::opt<bool> GCNTrackers(
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
- : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS),
- TheUpwardTracker(*C->LIS), HasHighPressure(false) {}
+ : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS),
+ UpwardTracker(*C->LIS), HasHighPressure(false) {}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -157,8 +157,8 @@ static void getRegisterPressures(bool AtTop,
const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure,
std::vector<unsigned> &MaxPressure,
- GCNDownwardRPTracker &TheTracker,
- GCNUpwardRPTracker &TheUpwardTracker,
+ GCNDownwardRPTracker &DownwardTracker,
+ GCNUpwardRPTracker &UpwardTracker,
ScheduleDAGMI *DAG) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
@@ -170,7 +170,7 @@ static void getRegisterPressures(bool AtTop,
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
} else {
if (AtTop) {
- GCNDownwardRPTracker TempTopTracker(TheTracker);
+ GCNDownwardRPTracker TempTopTracker(DownwardTracker);
auto MI = SU->getInstr();
TempTopTracker.advance(MI, false, DAG->getLIS());
@@ -181,7 +181,7 @@ static void getRegisterPressures(bool AtTop,
}
else {
- GCNUpwardRPTracker TempBotTracker(TheUpwardTracker);
+ GCNUpwardRPTracker TempBotTracker(UpwardTracker);
auto MI = SU->getInstr();
TempBotTracker.recede(*MI, false);
@@ -221,7 +221,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
- TheTracker, TheUpwardTracker, DAG);
+ DownwardTracker, UpwardTracker, DAG);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
@@ -240,7 +240,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
- TheTracker, TheUpwardTracker, DAG);
+ TheTracker, UpwardTracker, DAG);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -330,13 +330,13 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
if (DAG->isTrackingPressure()) {
SGPRPressure =
GCNTrackers
- ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum()
- : TheUpwardTracker.getPressure().getSGPRNum())
+ ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
+ : UpwardTracker.getPressure().getSGPRNum())
: Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure =
GCNTrackers
- ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false)
- : TheUpwardTracker.getPressure().getVGPRNum(false))
+ ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
+ : UpwardTracker.getPressure().getVGPRNum(false))
: Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
}
ReadyQueue &Q = Zone.Available;
@@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
- IsTopNode ? (void)TheTracker.advance(MI, false, DAG->getLIS())
- : TheUpwardTracker.recede(*MI, false);
+ IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
+ : UpwardTracker.recede(*MI, false);
}
return GenericScheduler::schedNode(SU, IsTopNode);
@@ -836,18 +836,17 @@ void GCNScheduleDAGMILive::runSchedStages() {
}
if (GCNTrackers) {
- GCNDownwardRPTracker *TheTracker = S.getTracker();
- GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker();
- GCNRPTracker::LiveRegSet *RegionLiveIns =
- &LiveIns[Stage->getRegionIdx()];
-
- reinterpret_cast<GCNRPTracker *>(TheTracker)
- ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- *RegionLiveIns);
- reinterpret_cast<GCNRPTracker *>(TheUpwardTracker)
- ->reset(
- Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+ GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
+ GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
+ GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+
+ reinterpret_cast<GCNRPTracker *>(DownwardTracker)->reset(
+ Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ *RegionLiveIns);
+ reinterpret_cast<GCNRPTracker *>(UpwardTracker)->reset(
+ Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
+ RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
+
}
ScheduleDAGMILive::schedule();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e8c89b2f1baf27..91b4c0c63d2bb3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -71,10 +71,10 @@ class GCNSchedStrategy : public GenericScheduler {
SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
// GCN RP Tracker for top-down scheduling
- mutable GCNDownwardRPTracker TheTracker;
+ mutable GCNDownwardRPTracker DownwardTracker;
// GCN RP Tracker for botttom-up scheduling
- mutable GCNUpwardRPTracker TheUpwardTracker;
+ mutable GCNUpwardRPTracker UpwardTracker;
public:
// schedule() have seen register pressure over the critical limits and had to
@@ -125,9 +125,9 @@ class GCNSchedStrategy : public GenericScheduler {
GCNSchedStageID getNextStage() const;
- GCNDownwardRPTracker *getTracker() { return &TheTracker; }
+ GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
- GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; }
+ GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
>From e583efa06e999342104df9e1a2fb4d9bb5f64641 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:29:41 -0700
Subject: [PATCH 07/24] Use DAG.MRI
Change-Id: I9f0275a0cede9e77dfd29262124f2a856f436c8c
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e4d32b6eefb9b1..c3bee344764160 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -840,13 +840,11 @@ void GCNScheduleDAGMILive::runSchedStages() {
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
- reinterpret_cast<GCNRPTracker *>(DownwardTracker)->reset(
- Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- *RegionLiveIns);
- reinterpret_cast<GCNRPTracker *>(UpwardTracker)->reset(
- Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(),
- RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx()));
-
+ reinterpret_cast<GCNRPTracker *>(DownwardTracker)
+ ->reset(MRI, *RegionLiveIns);
+ reinterpret_cast<GCNRPTracker *>(UpwardTracker)
+ ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
+ Stage->getRegionIdx()));
}
ScheduleDAGMILive::schedule();
>From a8396a4b343dca0f37faafadf63ce32191b0d55f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 28 May 2024 13:52:29 -0700
Subject: [PATCH 08/24] Formatting
Change-Id: I74c19a2cf20d2325178933f81e0e8716d7c62f17
---
llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 15 ++++++++-------
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index e89016b0ae984e..da065e8d8cb6b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
"target occupancy = "
<< TgtOcc << '\n');
- GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true);
+ GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true);
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (int I = 0; I < NumPasses; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c3bee344764160..724ffa4494323c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -66,8 +66,9 @@ static cl::opt<bool> GCNTrackers(
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
- : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS),
- UpwardTracker(*C->LIS), HasHighPressure(false) {}
+ : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+ DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
+}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -329,10 +330,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned VGPRPressure = 0;
if (DAG->isTrackingPressure()) {
SGPRPressure =
- GCNTrackers
- ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
- : UpwardTracker.getPressure().getSGPRNum())
- : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
+ : UpwardTracker.getPressure().getSGPRNum())
+ : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure =
GCNTrackers
? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
@@ -838,7 +838,8 @@ void GCNScheduleDAGMILive::runSchedStages() {
if (GCNTrackers) {
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
- GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()];
+ GCNRPTracker::LiveRegSet *RegionLiveIns =
+ &LiveIns[Stage->getRegionIdx()];
reinterpret_cast<GCNRPTracker *>(DownwardTracker)
->reset(MRI, *RegionLiveIns);
>From 349cb7ea8dfea366b50edfe3ae2270fb38c0f8f0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 15:03:02 -0700
Subject: [PATCH 09/24] Review comments
Change-Id: I09f9ca74c07b516daed0e93a85937df8b9aa922b
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 724ffa4494323c..5006ea37e2564b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -329,15 +329,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
if (DAG->isTrackingPressure()) {
- SGPRPressure =
- GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum()
- : UpwardTracker.getPressure().getSGPRNum())
- : Pressure[AMDGPU::RegisterPressureSets::SReg_32];
- VGPRPressure =
- GCNTrackers
- ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false)
- : UpwardTracker.getPressure().getVGPRNum(false))
- : Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ if (!GCNTrackers) {
+ SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ } else {
+ GCNRPTracker *T = &UpwardTracker;
+ if (Zone.isTop())
+ T = &DownwardTracker;
+ SGPRPressure = T->getPressure().getSGPRNum();
+ VGPRPressure = T->getPressure().getVGPRNum(false);
+ }
}
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
>From 01beddbec69dea1febf6c226d5dee86817bd5324 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 14 Jun 2024 16:14:57 -0700
Subject: [PATCH 10/24] Allocate Pressure vector
Change-Id: I5effce973fa2d945076e89b4453a844f0fc85fc9
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 5006ea37e2564b..cdafa01eeb857a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -170,6 +170,7 @@ static void getRegisterPressures(bool AtTop,
else
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
} else {
+ Pressure.resize(4, 0);
if (AtTop) {
GCNDownwardRPTracker TempTopTracker(DownwardTracker);
auto MI = SU->getInstr();
>From 4cad2b9d99ae64b4ad8b752ed14cd37a05be8a24 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 18 Jun 2024 11:39:48 -0700
Subject: [PATCH 11/24] Remove flag from upward RPTracker
Change-Id: I6217c03f56d34f584e5b23cf7c4462842bc7173b
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index cdafa01eeb857a..0c7639462905d7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -185,7 +185,7 @@ static void getRegisterPressures(bool AtTop,
else {
GCNUpwardRPTracker TempBotTracker(UpwardTracker);
auto MI = SU->getInstr();
- TempBotTracker.recede(*MI, false);
+ TempBotTracker.recede(*MI);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempBotTracker.getPressure().getSGPRNum();
@@ -492,7 +492,7 @@ void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
- : UpwardTracker.recede(*MI, false);
+ : UpwardTracker.recede(*MI);
}
return GenericScheduler::schedNode(SU, IsTopNode);
>From cb5c92603ea2967ddcb2dcc936029f02da237e97 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 19 Jun 2024 11:45:32 -0700
Subject: [PATCH 12/24] Review comments
Change-Id: Ibeaba6cab034636472b20c36adfadabbbc2c19ef
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 53 ++++++++++-----------
1 file changed, 25 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0c7639462905d7..c5d217d80a7c8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -165,33 +165,30 @@ static void getRegisterPressures(bool AtTop,
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
if (!GCNTrackers) {
- if (AtTop)
- TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else
- TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
- } else {
- Pressure.resize(4, 0);
- if (AtTop) {
- GCNDownwardRPTracker TempTopTracker(DownwardTracker);
- auto MI = SU->getInstr();
- TempTopTracker.advance(MI, false, DAG->getLIS());
-
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
- TempTopTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempTopTracker.getPressure().getVGPRNum(false);
- }
+ AtTop
+ ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
+ : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else {
- GCNUpwardRPTracker TempBotTracker(UpwardTracker);
- auto MI = SU->getInstr();
- TempBotTracker.recede(*MI);
+ return;
+ }
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
- TempBotTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempBotTracker.getPressure().getVGPRNum(false);
- }
+ // GCNTrackers
+ Pressure.resize(4, 0);
+ MachineInstr *MI = SU->getInstr();
+ if (AtTop) {
+ GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
+ TempDownwardTracker.advance(MI, false, DAG->getLIS());
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+ TempDownwardTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ TempDownwardTracker.getPressure().getVGPRNum(false);
+ } else {
+ GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
+ TempUpwardTracker.recede(*MI);
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
+ TempUpwardTracker.getPressure().getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ TempUpwardTracker.getPressure().getVGPRNum(false);
}
}
@@ -334,9 +331,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
- GCNRPTracker *T = &UpwardTracker;
- if (Zone.isTop())
- T = &DownwardTracker;
+ GCNRPTracker *T = Zone.isTop()
+ ? static_cast<GCNRPTracker *>(&UpwardTracker)
+ : static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
VGPRPressure = T->getPressure().getVGPRNum(false);
}
>From 80534d30e23b60d16e87e19ee8b9276dd8a0a88c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 20 Jun 2024 08:49:26 -0700
Subject: [PATCH 13/24] Don't modify existing PreRARematStage LiveIn handling
Change-Id: I96c99f12c59ef0eea86f7fbf134913ecc47dd6f2
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c5d217d80a7c8a..d48e33f7df950a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1669,6 +1669,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
+ // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+ DAG.BBLiveInMap.erase(OldMI);
+
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
@@ -1686,8 +1689,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
- DAG.BBLiveInMap = DAG.getBBLiveInMap();
-
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
>From 28b520d4a3c18e58b865586571373c9fbe7cf687 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 12 Aug 2024 13:55:44 -0700
Subject: [PATCH 14/24] Use GCNTracker RP speculation
Change-Id: I3e893ca2ffcf1032fe157b537c9563565215b123
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d48e33f7df950a..7ce8d8c56baf56 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -177,18 +177,18 @@ static void getRegisterPressures(bool AtTop,
MachineInstr *MI = SU->getInstr();
if (AtTop) {
GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
- TempDownwardTracker.advance(MI, false, DAG->getLIS());
+ TempDownwardTracker.bumpDownwardPressure(MI);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempDownwardTracker.getPressure().getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempDownwardTracker.getPressure().getVGPRNum(false);
+ TempDownwardTracker.getPressure().getArchVGPRNum();
} else {
GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
- TempUpwardTracker.recede(*MI);
+ TempUpwardTracker.bumpUpwardPressure(MI);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempUpwardTracker.getPressure().getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempUpwardTracker.getPressure().getVGPRNum(false);
+ TempUpwardTracker.getPressure().getArchVGPRNum();
}
}
>From de185daec81d9ca82bb84bbc09ac68244ebad139 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 20 Aug 2024 12:29:33 -0700
Subject: [PATCH 15/24] Port changes from pull/93088
Change-Id: I2de464b32d3c6ed9a77cbbc669d735dde63c2e47
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 45 +++++++++++++----------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index d1a50adc1918cf..cbcef5faf21ed9 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -299,11 +299,11 @@ static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
return I->LaneMask;
}
-static LaneBitmask
-getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
- bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
- LaneBitmask SafeDefault,
- bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
+static LaneBitmask getLanesWithProperty(
+ const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+ LaneBitmask SafeDefault,
+ function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) {
if (RegUnit.isVirtual()) {
const LiveInterval &LI = LIS.getInterval(RegUnit);
LaneBitmask Result;
@@ -318,14 +318,14 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
}
return Result;
- } else {
- const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
- // Be prepared for missing liveranges: We usually do not compute liveranges
- // for physical registers on targets with many registers (GPUs).
- if (LR == nullptr)
- return SafeDefault;
- return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
}
+
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+ // Be prepared for missing liveranges: We usually do not compute liveranges
+ // for physical registers on targets with many registers (GPUs).
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
}
/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
@@ -334,19 +334,21 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI,
const LiveIntervals *LIS,
bool Upward = false) {
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
if (MO.isUndef())
continue;
+ if (!MO.readsReg())
+ continue;
const MachineInstr *MI = MO.getParent();
SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
: (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx);
if (InRange) {
unsigned SubRegIdx = MO.getSubReg();
- LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
LastUseMask &= ~UseMask;
if (LastUseMask.none())
return LaneBitmask::getNone();
@@ -518,7 +520,9 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+ const SIRegisterInfo *TRI =
+ MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
assert(RegOpers.DeadDefs.empty());
RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
@@ -559,8 +563,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
: LIS.getMBBEndIdx(MI->getParent());
;
- LaneBitmask LastUseMask =
- findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true);
+ LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx,
+ *MRI, TRI, &LIS, true);
LastUseMask &= ~LiveAfter;
LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
@@ -734,7 +738,8 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ const SIRegisterInfo *TRI =
+ MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
@@ -761,7 +766,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
}
LastUseMask =
- findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS);
+ findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS);
if (LastUseMask.none())
continue;
>From ad2e468853ad93265a1a7206469472223f6ac854 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Aug 2024 15:16:05 -0700
Subject: [PATCH 16/24] Port changes from pull/93088
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 16 +++++-----------
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 ++--
2 files changed, 7 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index cbcef5faf21ed9..58d34546b9e748 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -321,8 +321,6 @@ static LaneBitmask getLanesWithProperty(
}
const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
- // Be prepared for missing liveranges: We usually do not compute liveranges
- // for physical registers on targets with many registers (GPUs).
if (LR == nullptr)
return SafeDefault;
return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
@@ -337,11 +335,9 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
const SIRegisterInfo *TRI,
const LiveIntervals *LIS,
bool Upward = false) {
- for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
if (MO.isUndef())
continue;
- if (!MO.readsReg())
- continue;
const MachineInstr *MI = MO.getParent();
SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx)
@@ -513,7 +509,8 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
-void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
+void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) {
assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -521,8 +518,6 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- const SIRegisterInfo *TRI =
- MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
assert(RegOpers.DeadDefs.empty());
RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
@@ -730,7 +725,8 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
});
}
-void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
+void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) {
assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx;
@@ -738,8 +734,6 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- const SIRegisterInfo *TRI =
- MI->getMF()->getSubtarget<GCNSubtarget>().getRegisterInfo();
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index f78e4d7da0a1dd..5f9434f91efc64 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -222,7 +222,7 @@ class GCNUpwardRPTracker : public GCNRPTracker {
/// to false allows for an externally managed iterator / program order.
void recede(const MachineInstr &MI);
- void bumpUpwardPressure(const MachineInstr *MI);
+ void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
/// \p returns whether the tracker's state after receding MI corresponds
/// to reported by LIS.
@@ -306,7 +306,7 @@ class GCNDownwardRPTracker : public GCNRPTracker {
MachineBasicBlock::const_iterator End,
const LiveRegSet *LiveRegsCopy = nullptr);
- void bumpDownwardPressure(const MachineInstr *MI);
+ void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
};
LaneBitmask getLiveLaneMask(unsigned Reg,
>From 0ec89ac36cd8c36054e7a2edbec0c4c76c8f78ef Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Aug 2024 15:34:33 -0700
Subject: [PATCH 17/24] Feed SIRegisterInfo to Trackers + Propagate unused AGPR
speculative pressure + Use correct previous VGPR pressure
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 +++++++++++----------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7ce8d8c56baf56..bf812e840b876c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -154,13 +154,11 @@ static bool canUsePressureDiffs(const SUnit &SU) {
return true;
}
-static void getRegisterPressures(bool AtTop,
- const RegPressureTracker &RPTracker, SUnit *SU,
- std::vector<unsigned> &Pressure,
- std::vector<unsigned> &MaxPressure,
- GCNDownwardRPTracker &DownwardTracker,
- GCNUpwardRPTracker &UpwardTracker,
- ScheduleDAGMI *DAG) {
+static void getRegisterPressures(
+ bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
+ std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
+ GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
+ ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
@@ -177,18 +175,22 @@ static void getRegisterPressures(bool AtTop,
MachineInstr *MI = SU->getInstr();
if (AtTop) {
GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
- TempDownwardTracker.bumpDownwardPressure(MI);
+ TempDownwardTracker.bumpDownwardPressure(MI, SRI);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempDownwardTracker.getPressure().getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
TempDownwardTracker.getPressure().getArchVGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+ TempDownwardTracker.getPressure().getAGPRNum();
} else {
GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
- TempUpwardTracker.bumpUpwardPressure(MI);
+ TempUpwardTracker.bumpUpwardPressure(MI, SRI);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
TempUpwardTracker.getPressure().getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
TempUpwardTracker.getPressure().getArchVGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+ TempDownwardTracker.getPressure().getAGPRNum();
}
}
@@ -220,7 +222,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
- DownwardTracker, UpwardTracker, DAG);
+ DownwardTracker, UpwardTracker, DAG, SRI);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
@@ -239,7 +241,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
- TheTracker, UpwardTracker, DAG);
+ TheTracker, UpwardTracker, DAG, SRI);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
@@ -335,7 +337,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
- VGPRPressure = T->getPressure().getVGPRNum(false);
+ VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
ReadyQueue &Q = Zone.Available;
>From adcd2c741516bd4ea7b13dc8e5b58e73e0cdb2cc Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 5 Sep 2024 08:24:43 -0700
Subject: [PATCH 18/24] Review comments
Change-Id: I286c9ed1ae91a68da881c6fa27f5f391102d0a9c
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 68 +++++++++++++--------
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 11 ++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
3 files changed, 54 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 58d34546b9e748..c47d0c0d613dd5 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -289,6 +289,7 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
}
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
Register RegUnit) {
auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
@@ -299,6 +300,7 @@ static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
return I->LaneMask;
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
static LaneBitmask getLanesWithProperty(
const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
@@ -326,6 +328,7 @@ static LaneBitmask getLanesWithProperty(
return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
/// The query starts with a lane bitmask which gets lanes/bits removed for every
/// use we find.
@@ -353,6 +356,35 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
return LastUseMask;
}
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit,
+ SlotIndex Pos) {
+ return getLanesWithProperty(
+ LIS, MRI, TrackLaneMasks, RegUnit, Pos, LaneBitmask::getAll(),
+ [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); });
+}
+
+// Copy/paste from RegisterPressure.cpp (RegisterOperands::adjustLaneLiveness)
+static void adjustDefLaneLiveness(SmallVectorImpl<RegisterMaskPair> &Defs,
+ SlotIndex &Pos, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ for (auto *I = Defs.begin(); I != Defs.end();) {
+ LaneBitmask LiveAfter =
+ getLiveLanesAt(LIS, MRI, true, I->RegUnit, Pos.getDeadSlot());
+ // If the def is all that is live after the instruction, then in case
+ // of a subregister def we need a read-undef flag.
+ LaneBitmask ActualDef = I->LaneMask & LiveAfter;
+ if (ActualDef.none()) {
+ I = Defs.erase(I);
+ } else {
+ I->LaneMask = ActualDef;
+ ++I;
+ }
+ }
+}
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -417,6 +449,7 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
}
void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+ GCNRegPressure TempPressure = CurPressure;
for (const RegisterMaskPair &P : DeadDefs) {
Register Reg = P.RegUnit;
if (!Reg.isVirtual())
@@ -426,16 +459,9 @@ void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
}
MaxPressure = max(MaxPressure, CurPressure);
- for (const RegisterMaskPair &P : DeadDefs) {
- Register Reg = P.RegUnit;
- if (!Reg.isVirtual())
- continue;
- LaneBitmask LiveMask = LiveRegs[Reg];
- LaneBitmask BumpedMask = LiveMask | P.LaneMask;
- CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI);
- }
+ CurPressure = TempPressure;
}
-
+/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
SlotIndex Pos) const {
return getLanesWithProperty(
@@ -520,7 +546,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true);
assert(RegOpers.DeadDefs.empty());
- RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+ adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI);
RegOpers.detectDeadDefs(*MI, LIS);
// Boost max pressure for all dead defs together.
@@ -537,11 +563,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
LaneBitmask DefLanes = P.LaneMask;
LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
- // There may be parts of the register that were dead before the
- // instruction, but became live afterwards. Similarly, some parts
- // may have been killed in this instruction.
CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
- CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI);
MaxPressure = max(MaxPressure, CurPressure);
}
// Generate liveness for uses.
@@ -549,19 +571,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
Register Reg = P.RegUnit;
if (!Reg.isVirtual())
continue;
- // If this register was also in a def operand, we've handled it
- // with defs.
- if (getRegLanes(RegOpers.Defs, Reg).any())
- continue;
LaneBitmask LiveAfter = LiveRegs[Reg];
- SlotIndex CurrIdx =
- LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot()
- : LIS.getMBBEndIdx(MI->getParent());
- ;
- LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx,
- *MRI, TRI, &LIS, true);
- LastUseMask &= ~LiveAfter;
- LaneBitmask LiveBefore = (LiveAfter | LastUseMask);
+ LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
}
MaxPressure = max(MaxPressure, CurPressure);
@@ -682,8 +693,13 @@ bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
LiveIntervals *TheLIS) {
if (UseInternalIterator && NextMI == MBBEnd)
return false;
+
advanceBeforeNext(MI, UseInternalIterator, TheLIS);
advanceToNext(MI, UseInternalIterator);
+ if (!UseInternalIterator) {
+ // We must remove any dead def lanes from the current RP
+ advanceBeforeNext(MI, true, TheLIS);
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 5f9434f91efc64..463da472bb69ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -169,6 +169,7 @@ class GCNRPTracker {
void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy,
bool After);
+ /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs);
LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
@@ -222,6 +223,11 @@ class GCNUpwardRPTracker : public GCNRPTracker {
/// to false allows for an externally managed iterator / program order.
void recede(const MachineInstr &MI);
+ /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+ /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This
+ /// does not rely on the implicit program ordering in the LiveIntervals to
+ /// support RP Speculation. It leaves the state of pressure inconsistent with
+ /// the current position
void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
/// \p returns whether the tracker's state after receding MI corresponds
@@ -306,6 +312,11 @@ class GCNDownwardRPTracker : public GCNRPTracker {
MachineBasicBlock::const_iterator End,
const LiveRegSet *LiveRegsCopy = nullptr);
+ /// Mostly copy/paste from CodeGen/RegisterPressure.cpp
+ /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This
+ /// does not rely on the implicit program ordering in the LiveIntervals to
+ /// support RP Speculation. It leaves the state of pressure inconsistent with
+ /// the current position
void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
};
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bf812e840b876c..651f25c80d60c7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,7 +190,7 @@ static void getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
TempUpwardTracker.getPressure().getArchVGPRNum();
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
- TempDownwardTracker.getPressure().getAGPRNum();
+ TempUpwardTracker.getPressure().getAGPRNum();
}
}
>From 7e35229dc8080b5735f5b63513cffc0183676ff3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 18 Sep 2024 12:59:36 -0700
Subject: [PATCH 19/24] Avoid const_cast
Change-Id: Ib7b21b2ab4cc44abc61fb8ad8880fb78f831619a
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index c47d0c0d613dd5..fb92924363d43b 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -599,15 +599,15 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
LiveIntervals *TheLIS) {
assert(MRI && "call reset first");
SlotIndex SI;
- LiveIntervals *CurrLIS;
- MachineInstr *CurrMI;
+ const LiveIntervals *CurrLIS;
+ const MachineInstr *CurrMI;
if (UseInternalIterator) {
if (!LastTrackedMI)
return NextMI == MBBEnd;
assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
- CurrLIS = const_cast<LiveIntervals *>(&LIS);
- CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+ CurrLIS = &LIS;
+ CurrMI = LastTrackedMI;
SI = NextMI == MBBEnd
? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
@@ -673,7 +673,7 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
LastTrackedMI = MI;
}
- MachineInstr *CurrMI = const_cast<MachineInstr *>(LastTrackedMI);
+ const MachineInstr *CurrMI = LastTrackedMI;
// Add new registers or mask bits.
for (const auto &MO : CurrMI->all_defs()) {
>From af9f2200e025dcd1d2d54e6bbc96d3064409a516 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Sep 2024 10:08:21 -0700
Subject: [PATCH 20/24] Fix shouldTrackVGPRs calculation
Change-Id: I3d0aae74f20927722cd6844b1d586ae7accab86e
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 651f25c80d60c7..28ca41d2dc96ed 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -333,7 +333,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
- GCNRPTracker *T = Zone.isTop()
+ GCNRPTracker *T = IsBottomUp
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
>From 94c8ba8c4163aabe7bef96e98722e84a3ca4d66c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 27 Sep 2024 12:40:02 -0700
Subject: [PATCH 21/24] Add lit tests
Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde
---
.../CodeGen/AMDGPU/high-RP-reschedule.mir | 10 +-
llvm/test/CodeGen/AMDGPU/pr51516.mir | 6 +-
.../schedule-amdgpu-tracker-physreg-crash.ll | 65 ++
.../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++
.../AMDGPU/schedule-amdgpu-trackers.ll | 647 ++++++++++++++++++
...schedule-regpressure-ilp-metric-spills.mir | 15 +
.../AMDGPU/schedule-relaxed-occupancy.ll | 10 +-
7 files changed, 1240 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
index e9005e94ce5db7..d57450baea911a 100644
--- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
+++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
@@ -1,11 +1,17 @@
# REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s
--- |
define amdgpu_kernel void @high-RP-reschedule() { ret void }
...
-# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4.
+# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage.
---
name: high-RP-reschedule
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 4be102f7860eab..49dd5c6c39ff5c 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
@@ -7,6 +8,9 @@
# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46
+# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64
+# GCN-GCNTRACKER-NOT: SI_SPILL
+
---
name: global_sextload_v32i32_to_v32i64
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
new file mode 100644
index 00000000000000..79187f51af0d2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
@@ -0,0 +1,65 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <7 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <5 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+ <16 x i32>, <6 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+; ERR-GCNTRACKERS: ran out of registers during register allocation
+; GCN-NOT: ran out of registers during register allocation
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+ %asm = call %asm.output asm sideeffect
+ "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+ "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+ %s0 = extractvalue %asm.output %asm, 0
+ %s1 = extractvalue %asm.output %asm, 1
+ %s2 = extractvalue %asm.output %asm, 2
+ %s3 = extractvalue %asm.output %asm, 3
+ %s4 = extractvalue %asm.output %asm, 4
+ %s5 = extractvalue %asm.output %asm, 5
+
+ %v0 = extractvalue %asm.output %asm, 6
+ %v1 = extractvalue %asm.output %asm, 7
+
+ %vcc = extractvalue %asm.output %asm, 8
+
+ ; scc is unavailable since it is live in
+ call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+ <16 x i32> %s0,
+ <16 x i32> %s1,
+ <16 x i32> %s2,
+ <8 x i32> %s3,
+ <2 x i32> %s4,
+ i32 %s5,
+ <16 x i32> %v0,
+ <7 x i32> %v1,
+ i64 %vcc,
+ ptr addrspace(5) %alloca1,
+ i32 0) ; use of scc
+
+ ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
new file mode 100644
index 00000000000000..c490c76f4531de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s
+
+; CHECK-LABEL: {{^}}spill:
+; GCN: codeLenInByte = 1000
+; GCN-GCNTRACKERS: codeLenInByte = 1016
+; GCN: NumSgprs: 104
+; GCN-GCNTRACKERS: NumSgprs: 104
+; GCN: NumVgprs: 1
+; GCN-GCNTRACKERS: NumVgprs: 2
+; GCN: ScratchSize: 0
+; GCN-GCNTRACKERS: ScratchSize: 0
+; GCN: Occupancy: 5
+; GCN-GCNTRACKERS: Occupancy: 5
+
+; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse
+
+define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
+entry:
+ %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+ %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+ %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+ %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+ %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+ %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+ %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+ %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+ %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+ %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+ %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+ %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+ %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+ %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+ %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+ %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+ %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+ %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+ %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+ %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+ %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+ %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+ %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+ %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+ %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+ %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+ %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+ %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+ %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+ %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+ %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+ %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+ %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+ %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+ %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+ %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+ %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+ %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+ %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+ %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+ %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+ %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+ %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+ %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+ %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+ %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+ %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+ %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+ %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+ %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+ %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+ %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+ %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+ %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+ %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+ %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+ %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+ %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+ %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+ %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+ %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+ %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+ %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+ %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+ %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+ %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+ %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+ %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+ %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+ %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+ %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+ %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+ %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+ %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+ %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+ %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+ %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+ %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+ %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+ %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+ %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+ %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+ %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+ %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+ %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+ %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+ %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+ %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+ %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+ %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+ %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+ %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+ %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+ %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+ %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+ %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+ %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+ %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+ %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+ %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+ %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+ %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+ %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+ %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+ %cmp = icmp eq i32 %cnd, 0
+ br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+ ; 64 byte asm
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64",""() #0
+ br label %bb3
+
+bb3:
+ tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+ tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+ tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+ tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+ tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+ tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+ tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+ tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+ tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+ tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+ tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+ tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+ tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+ tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+ tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+ tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+ tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+ tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+ tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+ tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+ tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+ tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+ tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+ tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+ tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+ tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+ tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+ tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+ tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+ tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+ tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+ tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+ tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+ tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+ tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+ tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+ tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+ tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+ tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+ tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+ tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+ tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+ tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+ tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+ tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+ tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+ tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+ tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+ tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+ tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+ tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+ tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+ tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+ tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+ tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+ tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+ tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+ tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+ tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+ tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+ tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+ tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+ tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+ tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+ tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+ tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+ tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+ tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+ tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+ tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+ tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+ tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+ tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+ tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+ tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+ tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+ tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+ tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+ tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+ tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+ tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+ tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+ tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+ tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+ tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+ tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+ tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+ tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+ tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+ tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+ tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+ tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+ tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+ tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+ tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+ tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+ tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+ tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+ tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+ tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+ tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+ tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}spill_func:
+; GCN: codeLenInByte = 1612
+; GCN-GCNTRACKERS: codeLenInByte = 1660
+; GCN: NumSgprs: 104
+; GCN-GCNTRACKERS: NumSgprs: 104
+; GCN: NumVgprs: 3
+; GCN-GCNTRACKERS: NumVgprs: 4
+; GCN: ScratchSize: 12
+; GCN-GCNTRACKERS: ScratchSize: 16
+
+define void @spill_func(ptr addrspace(1) %arg) #0 {
+entry:
+ %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+ %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0
+ %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0
+ %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0
+ %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0
+ %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0
+ %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0
+ %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0
+ %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0
+ %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0
+ %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0
+ %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0
+ %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0
+ %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0
+ %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0
+ %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0
+ %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0
+ %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0
+ %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0
+ %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0
+ %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0
+ %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0
+ %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0
+ %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0
+ %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0
+ %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0
+ %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0
+ %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0
+ %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0
+ %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0
+ %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0
+ %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0
+ %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0
+ %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0
+ %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0
+ %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0
+ %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0
+ %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0
+ %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0
+ %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0
+ %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0
+ %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0
+ %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0
+ %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0
+ %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0
+ %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0
+ %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0
+ %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0
+ %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0
+ %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0
+ %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0
+ %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0
+ %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0
+ %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0
+ %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0
+ %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0
+ %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0
+ %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0
+ %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0
+ %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0
+ %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0
+ %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0
+ %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0
+ %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0
+ %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0
+ %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0
+ %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0
+ %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0
+ %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0
+ %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0
+ %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0
+ %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0
+ %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0
+ %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0
+ %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0
+ %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0
+ %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0
+ %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0
+ %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0
+ %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0
+ %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0
+ %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0
+ %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0
+ %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0
+ %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0
+ %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0
+ %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0
+ %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0
+ %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0
+ %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0
+ %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0
+ %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0
+ %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0
+ %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0
+ %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0
+ %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0
+ %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0
+ %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0
+ %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0
+ %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0
+ %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0
+ %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0
+ %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0
+ %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0
+ %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0
+ %cmp = icmp eq i32 %cnd, 0
+ br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2: ; 68 bytes
+ ; 64 byte asm
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64",""() #0
+ br label %bb3
+
+bb3:
+ tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0
+ tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0
+ tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0
+ tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0
+ tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0
+ tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0
+ tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0
+ tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0
+ tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0
+ tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0
+ tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0
+ tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0
+ tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0
+ tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0
+ tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0
+ tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0
+ tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0
+ tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0
+ tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0
+ tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0
+ tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0
+ tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0
+ tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0
+ tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0
+ tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0
+ tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0
+ tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0
+ tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0
+ tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0
+ tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0
+ tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0
+ tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0
+ tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0
+ tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0
+ tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0
+ tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0
+ tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0
+ tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0
+ tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0
+ tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0
+ tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0
+ tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0
+ tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0
+ tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0
+ tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0
+ tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0
+ tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0
+ tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0
+ tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0
+ tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0
+ tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0
+ tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0
+ tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0
+ tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0
+ tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0
+ tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0
+ tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0
+ tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0
+ tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0
+ tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0
+ tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0
+ tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0
+ tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0
+ tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0
+ tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0
+ tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0
+ tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0
+ tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0
+ tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0
+ tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0
+ tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0
+ tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0
+ tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0
+ tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0
+ tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0
+ tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0
+ tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0
+ tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0
+ tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0
+ tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0
+ tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0
+ tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0
+ tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0
+ tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0
+ tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0
+ tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0
+ tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0
+ tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0
+ tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0
+ tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0
+ tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0
+ tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0
+ tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0
+ tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0
+ tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0
+ tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0
+ tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0
+ tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0
+ tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0
+ tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0
+ tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0
+ tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0
+ tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
new file mode 100644
index 00000000000000..53f533ebb28427
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -0,0 +1,647 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s
+; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s
+; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s
+
+; The GCN trackers are sensitive to minor changes in register pressure (RP), and may avoid scheduling certain instructions
+; which, if scheduled, would have enabled scheduling of other RP-reducing instructions.
+
+; CHECK-LABEL: {{^}}return_72xi32:
+; GFX11-PAL: codeLenInByte = 768
+; GFX11-PAL-GCNTRACKERS: codeLenInByte = 888
+; GFX11-PAL: NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS: NumSgprs: 33
+; GFX11-PAL: NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
+; GFX11-PAL: ScratchSize: 220
+; GFX11-PAL-GCNTRACKERS: ScratchSize: 248
+
+
+; CHECK-LABEL: {{^}}call_72xi32:
+; GFX11-PAL: codeLenInByte = 1300
+; GFX11-PAL-GCNTRACKERS: codeLenInByte = 1372
+; GFX11-PAL: NumSgprs: 35
+; GFX11-PAL-GCNTRACKERS: NumSgprs: 35
+; GFX11-PAL: NumVgprs: 64
+; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
+; GFX11-PAL: ScratchSize: 2780
+; GFX11-PAL-GCNTRACKERS: ScratchSize: 2808
+
+
+define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
+ ret <72 x i32> %val
+}
+
+define amdgpu_gfx void @call_72xi32() #1 {
+entry:
+ %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer)
+ %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0
+ %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58
+ %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64:
+; TONGA: codeLenInByte = 420
+; TONGA-GCNTRACKERS: codeLenInByte = 436
+; TONGA: NumSgprs: 96
+; TONGA-GCNTRACKERS: NumSgprs: 96
+; TONGA: NumVgprs: 33
+; TONGA-GCNTRACKERS: NumVgprs: 25
+; TONGA: Occupancy: 7
+; TONGA-GCNTRACKERS: Occupancy: 8
+
+
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+ %val = load <16 x half>, ptr addrspace(1) %in
+ %cvt = fpext <16 x half> %val to <16 x double>
+ store <16 x double> %cvt, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GENERIC: codeLenInByte = 860
+; GENERIC-GCNTRACKERS: codeLenInByte = 860
+; GENERIC: NumSgprs: 71
+; GENERIC-GCNTRACKERS: NumSgprs: 54
+; GENERIC: NumVgprs: 16
+; GENERIC-GCNTRACKERS: NumVgprs: 16
+; GENERIC: Occupancy: 7
+; GENERIC-GCNTRACKERS: Occupancy: 8
+
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+ %load = load <64 x i16>, ptr addrspace(4) %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
+; GFX908: codeLenInByte = 1436
+; GFX908-GCNTRACKERS: codeLenInByte = 1436
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
+; GFX908: NumVgprs: 43
+; GFX908-GCNTRACKERS: NumVgprs: 39
+; GFX908: Occupancy: 5
+; GFX908-GCNTRACKERS: Occupancy: 6
+
+
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
+entry:
+ %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i2 = load i64, ptr addrspace(4) %i, align 8
+ %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %i4 = shl i32 %i3, 8
+ %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
+ %i6 = add i32 %i4, %i5
+ %i7 = trunc i64 %i2 to i32
+ %conv = add i32 %i6, %i7
+ %conv.frozen = freeze i32 %conv
+ %div = udiv i32 %conv.frozen, 49
+ %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef
+ %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5
+ br label %for.cond28.preheader
+
+for.cond28.preheader: ; preds = %for.cond28.preheader, %entry
+ %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ]
+ %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ]
+ %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ]
+ %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ]
+ %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ]
+ %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ]
+ %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ]
+ %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ]
+ %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ]
+ %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ]
+ %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ]
+ %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ]
+ %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ]
+ %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ]
+ %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ]
+ %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ]
+ %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ]
+ %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ]
+ %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ]
+ %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ]
+ %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ]
+ %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ]
+ %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ]
+ %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ]
+ %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ]
+ %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ]
+ %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ]
+ %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ]
+ %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ]
+ %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
+ %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
+ %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
+ %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
+ %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
+ %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
+ %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4
+ %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49
+ %i9 = load float, ptr addrspace(1) %add.ptr47, align 4
+ %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98
+ %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4
+ %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147
+ %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4
+ %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4
+ %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024
+ %i13 = load float, ptr addrspace(4) %add.ptr66, align 4
+ %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048
+ %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4
+ %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072
+ %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4
+ %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1
+ %i16 = load float, ptr addrspace(4) %add.ptr70, align 4
+ %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025
+ %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4
+ %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049
+ %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4
+ %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073
+ %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4
+ %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2
+ %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4
+ %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026
+ %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4
+ %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050
+ %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4
+ %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074
+ %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4
+ %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3
+ %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4
+ %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027
+ %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4
+ %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051
+ %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4
+ %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075
+ %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4
+ %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4
+ %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4
+ %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028
+ %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4
+ %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052
+ %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4
+ %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076
+ %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4
+ %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5
+ %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4
+ %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029
+ %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4
+ %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053
+ %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4
+ %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077
+ %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4
+ %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6
+ %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4
+ %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030
+ %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4
+ %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054
+ %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4
+ %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078
+ %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4
+ %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7
+ %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4
+ %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031
+ %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4
+ %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055
+ %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4
+ %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079
+ %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4
+ %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8
+ %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4
+ %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032
+ %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4
+ %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056
+ %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4
+ %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080
+ %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4
+ %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9
+ %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4
+ %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033
+ %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4
+ %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057
+ %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4
+ %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081
+ %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4
+ %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10
+ %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4
+ %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034
+ %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4
+ %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058
+ %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4
+ %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082
+ %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4
+ %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11
+ %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4
+ %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035
+ %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4
+ %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059
+ %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4
+ %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083
+ %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4
+ %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12
+ %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4
+ %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036
+ %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4
+ %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060
+ %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4
+ %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084
+ %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4
+ %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13
+ %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4
+ %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037
+ %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4
+ %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061
+ %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4
+ %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085
+ %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4
+ %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14
+ %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4
+ %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038
+ %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4
+ %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062
+ %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4
+ %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086
+ %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4
+ %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15
+ %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4
+ %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039
+ %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4
+ %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063
+ %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4
+ %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087
+ %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4
+ %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16
+ %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4
+ %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040
+ %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4
+ %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064
+ %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4
+ %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088
+ %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4
+ %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17
+ %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4
+ %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041
+ %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4
+ %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065
+ %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4
+ %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089
+ %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4
+ %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18
+ %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4
+ %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042
+ %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4
+ %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066
+ %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4
+ %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090
+ %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4
+ %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19
+ %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4
+ %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043
+ %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4
+ %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067
+ %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4
+ %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091
+ %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4
+ %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20
+ %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4
+ %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044
+ %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4
+ %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068
+ %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4
+ %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092
+ %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4
+ %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21
+ %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4
+ %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045
+ %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4
+ %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069
+ %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4
+ %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093
+ %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4
+ %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22
+ %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4
+ %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046
+ %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4
+ %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070
+ %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4
+ %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094
+ %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4
+ %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23
+ %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4
+ %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047
+ %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4
+ %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071
+ %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4
+ %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095
+ %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4
+ %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24
+ %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4
+ %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048
+ %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4
+ %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072
+ %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4
+ %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096
+ %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4
+ %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25
+ %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4
+ %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049
+ %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4
+ %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073
+ %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4
+ %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097
+ %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4
+ %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26
+ %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4
+ %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050
+ %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4
+ %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074
+ %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4
+ %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098
+ %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4
+ %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27
+ %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4
+ %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051
+ %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4
+ %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075
+ %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4
+ %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099
+ %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4
+ %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28
+ %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4
+ %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052
+ %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4
+ %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076
+ %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4
+ %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100
+ %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4
+ %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29
+ %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4
+ %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053
+ %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4
+ %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077
+ %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4
+ %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101
+ %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4
+ %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30
+ %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4
+ %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054
+ %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4
+ %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078
+ %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4
+ %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102
+ %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4
+ %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31
+ %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4
+ %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055
+ %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4
+ %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079
+ %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4
+ %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103
+ %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4
+ %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196
+ %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0)
+ %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140)
+ %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141)
+ %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142)
+ %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0)
+ %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144)
+ %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145)
+ %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146)
+ %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0)
+ %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148)
+ %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149)
+ %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150)
+ %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0)
+ %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152)
+ %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153)
+ %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154)
+ %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0)
+ %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156)
+ %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157)
+ %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158)
+ %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0)
+ %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160)
+ %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161)
+ %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162)
+ %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0)
+ %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164)
+ %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165)
+ %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166)
+ %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0)
+ %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168)
+ %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169)
+ %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170)
+ %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0)
+ %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172)
+ %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173)
+ %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174)
+ %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0)
+ %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176)
+ %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177)
+ %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178)
+ %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0)
+ %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180)
+ %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181)
+ %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182)
+ %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0)
+ %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184)
+ %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185)
+ %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186)
+ %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0)
+ %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188)
+ %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189)
+ %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190)
+ %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0)
+ %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192)
+ %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193)
+ %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194)
+ %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0)
+ %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196)
+ %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197)
+ %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198)
+ %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0)
+ %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200)
+ %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201)
+ %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202)
+ %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0)
+ %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204)
+ %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205)
+ %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206)
+ %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0)
+ %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208)
+ %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209)
+ %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210)
+ %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0)
+ %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212)
+ %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213)
+ %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214)
+ %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0)
+ %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216)
+ %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217)
+ %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218)
+ %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0)
+ %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220)
+ %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221)
+ %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222)
+ %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0)
+ %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224)
+ %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225)
+ %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226)
+ %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0)
+ %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228)
+ %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229)
+ %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230)
+ %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0)
+ %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232)
+ %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233)
+ %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234)
+ %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0)
+ %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236)
+ %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237)
+ %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238)
+ %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0)
+ %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240)
+ %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241)
+ %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242)
+ %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0)
+ %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244)
+ %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245)
+ %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246)
+ %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0)
+ %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248)
+ %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249)
+ %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250)
+ %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0)
+ %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252)
+ %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253)
+ %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254)
+ %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0)
+ %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256)
+ %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257)
+ %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258)
+ %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0)
+ %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260)
+ %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261)
+ %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262)
+ %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0)
+ %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264)
+ %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265)
+ %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266)
+ %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096
+ %inc116 = add nuw nsw i32 %ci.0286, 1
+ %exitcond.not = icmp eq i32 %inc116, 512
+ br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader
+
+for.cond.cleanup26: ; preds = %for.cond28.preheader
+ %mul119 = shl nuw nsw i32 undef, 1
+ %mul120 = mul i32 %div, 200704
+ %mul121 = mul i32 undef, 6272
+ %add122 = add i32 %mul120, %mul121
+ %mul123 = mul nuw nsw i32 undef, 28
+ %add124 = add i32 %add122, %mul123
+ %add126 = add i32 %add124, %mul119
+ %idx.ext127 = zext i32 %add126 to i64
+ %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127
+ store float %i143, ptr addrspace(1) %add.ptr128, align 4
+ %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196
+ store float %i147, ptr addrspace(1) %add.ptr184, align 4
+ %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4
+ %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4
+ %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196
+ store float %i151, ptr addrspace(1) %add.ptr184.1, align 4
+ %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196
+ store float %i155, ptr addrspace(1) %add.ptr184.2, align 4
+ %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196
+ store float %i159, ptr addrspace(1) %add.ptr184.3, align 4
+ %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196
+ store float %i163, ptr addrspace(1) %add.ptr184.4, align 4
+ %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4
+ %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196
+ store float %i167, ptr addrspace(1) %add.ptr184.5, align 4
+ %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4
+ %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196
+ store float %i171, ptr addrspace(1) %add.ptr184.6, align 4
+ %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196
+ store float %i175, ptr addrspace(1) %add.ptr184.7, align 4
+ %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4
+ %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1
+ store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4
+ %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196
+ store float %i179, ptr addrspace(1) %add.ptr184.8, align 4
+ %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196
+ store float %i183, ptr addrspace(1) %add.ptr184.9, align 4
+ %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196
+ store float %i187, ptr addrspace(1) %add.ptr184.10, align 4
+ %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196
+ store float %i191, ptr addrspace(1) %add.ptr184.11, align 4
+ %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196
+ store float %i195, ptr addrspace(1) %add.ptr184.12, align 4
+ %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196
+ store float %i199, ptr addrspace(1) %add.ptr184.13, align 4
+ %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196
+ store float %i203, ptr addrspace(1) %add.ptr184.14, align 4
+ %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196
+ store float %i207, ptr addrspace(1) %add.ptr184.15, align 4
+ %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196
+ store float %i211, ptr addrspace(1) %add.ptr184.16, align 4
+ %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196
+ store float %i215, ptr addrspace(1) %add.ptr184.17, align 4
+ %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196
+ store float %i219, ptr addrspace(1) %add.ptr184.18, align 4
+ %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196
+ store float %i223, ptr addrspace(1) %add.ptr184.19, align 4
+ %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196
+ store float %i227, ptr addrspace(1) %add.ptr184.20, align 4
+ %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196
+ store float %i231, ptr addrspace(1) %add.ptr184.21, align 4
+ %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196
+ store float %i235, ptr addrspace(1) %add.ptr184.22, align 4
+ %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196
+ store float %i239, ptr addrspace(1) %add.ptr184.23, align 4
+ %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196
+ store float %i243, ptr addrspace(1) %add.ptr184.24, align 4
+ %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196
+ store float %i247, ptr addrspace(1) %add.ptr184.25, align 4
+ %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196
+ store float %i251, ptr addrspace(1) %add.ptr184.26, align 4
+ %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196
+ store float %i255, ptr addrspace(1) %add.ptr184.27, align 4
+ %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196
+ store float %i259, ptr addrspace(1) %add.ptr184.28, align 4
+ %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196
+ store float %i263, ptr addrspace(1) %add.ptr184.29, align 4
+ %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196
+ store float %i267, ptr addrspace(1) %add.ptr184.30, align 4
+ ret void
+}
+
+
+
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workgroup.id.x() #3
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+!0 = !{i32 1, i32 2, i32 1, i32 0}
+!1 = !{!"none", !"none", !"none", !"none"}
+!2 = !{!"ptr", !"ptr", !"ptr", !"float"}
+!3 = !{!"restrict const", !"restrict const", !"restrict", !""}
+!4 = !{i32 256, i32 1, i32 1}
+!5 = !{i32 0, i32 1024}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="64" }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readnone speculatable willreturn }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 14bb4310c619ea..3ce6279f9082fb 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
--- |
define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
@@ -11,6 +12,20 @@
# GCN-LABEL: name: no_sched_metric_due_to_spills
# GCN-NOT: SI_SPILL_
# GCN: S_ENDPGM
+
+# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: SI_SPILL_V32_SAVE
+# GCN-GCNTRACKER: S_ENDPGM
+
+# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is still high,
+# and RA is unable to allocate without spills. By running the high-RP-reschedule stage we would have further decreased RP, which provides increased
+# flexibility for RA.
+
---
name: no_sched_metric_due_to_spills
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
index 94815558bf3d6d..71f8d91874f04f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll
@@ -1,16 +1,24 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s
; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target
-; GCN-LABEL: {{^}}load_fma_store:
+; CHECK-LABEL: {{^}}load_fma_store:
; OCC: NumVgprs: 32
+; OCC-GCNTRACKER: NumVgprs: 24
; RELAX: NumVgprs: 64
+; RELAX-GCNTRACKER: NumVgprs: 60
; OCC: NumVGPRsForWavesPerEU: 32
+; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24
; RELAX: NumVGPRsForWavesPerEU: 64
+; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60
; OCC: Occupancy: 8
+; OCC-GCNTRACKER: Occupancy: 8
; RELAX: Occupancy: 4
+; RELAX-GCNTRACKER: Occupancy: 4
define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 {
bb:
>From 7ee4ffdb98697d661420901055782ccc607565f5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 27 Sep 2024 12:40:02 -0700
Subject: [PATCH 22/24] Remove CurrLIS
Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 21 ++++++++-------------
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 10 ++++------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
3 files changed, 13 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index fb92924363d43b..888b5907a979e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -595,26 +595,22 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
}
bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
- bool UseInternalIterator,
- LiveIntervals *TheLIS) {
+ bool UseInternalIterator) {
assert(MRI && "call reset first");
SlotIndex SI;
- const LiveIntervals *CurrLIS;
const MachineInstr *CurrMI;
if (UseInternalIterator) {
if (!LastTrackedMI)
return NextMI == MBBEnd;
assert(NextMI == MBBEnd || !NextMI->isDebugInstr());
- CurrLIS = &LIS;
CurrMI = LastTrackedMI;
SI = NextMI == MBBEnd
- ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot()
- : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex();
+ ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot()
+ : LIS.getInstructionIndex(*NextMI).getBaseIndex();
} else { //! UseInternalIterator
- CurrLIS = TheLIS;
- SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex();
+ SI = LIS.getInstructionIndex(*MI).getBaseIndex();
CurrMI = MI;
}
@@ -631,7 +627,7 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
continue;
if (!SeenRegs.insert(MO.getReg()).second)
continue;
- const LiveInterval &LI = CurrLIS->getInterval(MO.getReg());
+ const LiveInterval &LI = LIS.getInterval(MO.getReg());
if (LI.hasSubRanges()) {
auto It = LiveRegs.end();
for (const auto &S : LI.subranges()) {
@@ -689,16 +685,15 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI,
MaxPressure = max(MaxPressure, CurPressure);
}
-bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator,
- LiveIntervals *TheLIS) {
+bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) {
if (UseInternalIterator && NextMI == MBBEnd)
return false;
- advanceBeforeNext(MI, UseInternalIterator, TheLIS);
+ advanceBeforeNext(MI, UseInternalIterator);
advanceToNext(MI, UseInternalIterator);
if (!UseInternalIterator) {
// We must remove any dead def lanes from the current RP
- advanceBeforeNext(MI, true, TheLIS);
+ advanceBeforeNext(MI, true);
}
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 463da472bb69ff..169c2e42c08054 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -280,10 +280,9 @@ class GCNDownwardRPTracker : public GCNRPTracker {
/// it is assumed that the tracker is using an externally managed iterator,
/// and advance* calls will not update the state of the iterator. In such
/// cases, the tracker will move to the state right before the provided \p MI
- /// and use the provided \p TheLIS for RP calculations.
+ /// and use LIS for RP calculations.
bool advanceBeforeNext(MachineInstr *MI = nullptr,
- bool UseInternalIterator = true,
- LiveIntervals *TheLIS = nullptr);
+ bool UseInternalIterator = true);
/// Move to the state at the MI, advanceBeforeNext has to be called first.
/// If \p UseInternalIterator is true, then internal iterators are used and
@@ -300,9 +299,8 @@ class GCNDownwardRPTracker : public GCNRPTracker {
/// then it is assumed that the tracker is using an externally managed
/// iterator, and advance* calls will not update the state of the iterator. In
/// such cases, the tracker will move to the state right before the provided
- /// \p MI and use the provided \p TheLIS for RP calculations.
- bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true,
- LiveIntervals *TheLIS = nullptr);
+ /// \p MI and use LIS for RP calculations.
+ bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true);
/// Advance instructions until before \p End.
bool advance(MachineBasicBlock::const_iterator End);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 28ca41d2dc96ed..b47cdb2e7ddcf1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -490,7 +490,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
- IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS())
+ IsTopNode ? (void)DownwardTracker.advance(MI, false)
: UpwardTracker.recede(*MI);
}
>From bf61d05d7a7af61f4f6d9c3452f1f817f84f548b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 3 Oct 2024 13:42:41 -0700
Subject: [PATCH 23/24] Mark speculative query methods as const
Change-Id: I9ebe0cf7252068dcee90d419945085efae75547d
---
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 55 +++++++++------------
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 6 ++-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 21 +++-----
3 files changed, 33 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 888b5907a979e7..a7a3c65c3388b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -448,19 +448,6 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_,
MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
}
-void GCNRPTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
- GCNRegPressure TempPressure = CurPressure;
- for (const RegisterMaskPair &P : DeadDefs) {
- Register Reg = P.RegUnit;
- if (!Reg.isVirtual())
- continue;
- LaneBitmask LiveMask = LiveRegs[Reg];
- LaneBitmask BumpedMask = LiveMask | P.LaneMask;
- CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI);
- }
- MaxPressure = max(MaxPressure, CurPressure);
- CurPressure = TempPressure;
-}
/// Mostly copy/paste from CodeGen/RegisterPressure.cpp
LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit,
SlotIndex Pos) const {
@@ -535,8 +522,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
-void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
- const SIRegisterInfo *TRI) {
+GCNRegPressure
+GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const {
assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -549,33 +537,32 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI,
adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI);
RegOpers.detectDeadDefs(*MI, LIS);
- // Boost max pressure for all dead defs together.
- // Since CurrSetPressure and MaxSetPressure
- bumpDeadDefs(RegOpers.DeadDefs);
+ GCNRegPressure TempPressure = CurPressure;
// Kill liveness at live defs.
for (const RegisterMaskPair &P : RegOpers.Defs) {
Register Reg = P.RegUnit;
if (!Reg.isVirtual())
continue;
- LaneBitmask LiveAfter = LiveRegs[Reg];
+ LaneBitmask LiveAfter =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
LaneBitmask DefLanes = P.LaneMask;
LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
- CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
- MaxPressure = max(MaxPressure, CurPressure);
+ TempPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI);
}
// Generate liveness for uses.
for (const RegisterMaskPair &P : RegOpers.Uses) {
Register Reg = P.RegUnit;
if (!Reg.isVirtual())
continue;
- LaneBitmask LiveAfter = LiveRegs[Reg];
+ LaneBitmask LiveAfter =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
- CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
+ TempPressure.inc(Reg, LiveAfter, LiveBefore, *MRI);
}
- MaxPressure = max(MaxPressure, CurPressure);
+ return TempPressure;
}
////////////////////////////////////////////////////////////////////////////////
@@ -736,8 +723,9 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
});
}
-void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
- const SIRegisterInfo *TRI) {
+GCNRegPressure
+GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const {
assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
SlotIndex SlotIdx;
@@ -747,6 +735,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false);
RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx);
+ GCNRegPressure TempPressure = CurPressure;
for (const RegisterMaskPair &Use : RegOpers.Uses) {
Register Reg = Use.RegUnit;
@@ -775,9 +764,10 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
if (LastUseMask.none())
continue;
- LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask LiveMask =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
LaneBitmask NewMask = LiveMask & ~LastUseMask;
- CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
}
// Generate liveness for defs.
@@ -785,14 +775,13 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
Register Reg = Def.RegUnit;
if (!Reg.isVirtual())
continue;
- LaneBitmask LiveMask = LiveRegs[Reg];
+ LaneBitmask LiveMask =
+ LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0);
LaneBitmask NewMask = LiveMask | Def.LaneMask;
- CurPressure.inc(Reg, LiveMask, NewMask, *MRI);
+ TempPressure.inc(Reg, LiveMask, NewMask, *MRI);
}
- MaxPressure = max(MaxPressure, CurPressure);
- // Boost pressure for all dead defs together.
- bumpDeadDefs(RegOpers.DeadDefs);
+ return TempPressure;
}
bool GCNUpwardRPTracker::isValid() const {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 169c2e42c08054..a583efb457aea6 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -228,7 +228,8 @@ class GCNUpwardRPTracker : public GCNRPTracker {
/// does not rely on the implicit program ordering in the LiveIntervals to
/// support RP Speculation. It leaves the state of pressure inconsistent with
/// the current position
- void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
+ GCNRegPressure bumpUpwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const;
/// \p returns whether the tracker's state after receding MI corresponds
/// to reported by LIS.
@@ -315,7 +316,8 @@ class GCNDownwardRPTracker : public GCNRPTracker {
/// does not rely on the implicit program ordering in the LiveIntervals to
/// support RP Speculation. It leaves the state of pressure inconsistent with
/// the current position
- void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI);
+ GCNRegPressure bumpDownwardPressure(const MachineInstr *MI,
+ const SIRegisterInfo *TRI) const;
};
LaneBitmask getLiveLaneMask(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b47cdb2e7ddcf1..e28acd4c07beb6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -173,25 +173,18 @@ static void getRegisterPressures(
// GCNTrackers
Pressure.resize(4, 0);
MachineInstr *MI = SU->getInstr();
+ GCNRegPressure NewPressure;
if (AtTop) {
GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
- TempDownwardTracker.bumpDownwardPressure(MI, SRI);
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
- TempDownwardTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempDownwardTracker.getPressure().getArchVGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
- TempDownwardTracker.getPressure().getAGPRNum();
+ NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
} else {
GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
- TempUpwardTracker.bumpUpwardPressure(MI, SRI);
- Pressure[AMDGPU::RegisterPressureSets::SReg_32] =
- TempUpwardTracker.getPressure().getSGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- TempUpwardTracker.getPressure().getArchVGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
- TempUpwardTracker.getPressure().getAGPRNum();
+ NewPressure = TempUpwardTracker.bumpUpwardPressure(MI, SRI);
}
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
+ NewPressure.getArchVGPRNum();
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
>From 62d058e522f9a5edb9d599cf718772f6e10c3cbe Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sun, 6 Oct 2024 15:43:51 -0700
Subject: [PATCH 24/24] Fix lit tests
Change-Id: Ie204904f04dc9d2f53d586795c886a3f8c6b1268
---
llvm/test/CodeGen/AMDGPU/pr51516.mir | 4 ++--
.../CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 49dd5c6c39ff5c..f496a4b06bb237 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
index 3ce6279f9082fb..34d203e0de2ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
--- |
define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
More information about the llvm-commits
mailing list