[llvm] [CodeGen][MachinePipeliner] Limit register pressure when scheduling (PR #74807)
Ryotaro KASUGA via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 17 02:33:47 PST 2024
https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/74807
>From 79336861496c85be26cecfc58dbd32f419372796 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 8 Dec 2023 14:39:37 +0900
Subject: [PATCH 1/6] [CodeGen][MachinePipeliner] Limit register pressure when
scheduling
In software pipelining, when searching for the Initiation Interval (II),
`MachinePipeliner` tries to reduce register pressure, but doesn't check
how many variables can actually be alive at the same time. This can result
in a lot of register spills/fills being generated after register
allocation, which might cause performance degradation. To prevent such
cases, this patch adds a check phase that calculates the maximum
register pressure of the scheduled loop and rejects it if the pressure is
too high. This can be enabled by specifying
`pipeliner-register-pressure`. Additionally, the II search range is
currently fixed at 10, which is too small to find a schedule when the
above algorithm is applied. Therefore, this patch also adds a new option
`pipeliner-ii-search-range` to specify the length of the range to
search. There is one more new option,
`pipeliner-register-pressure-margin`, which can be used to estimate a
register pressure limit lower than the actual one for conservative analysis.
Discourse thread: https://discourse.llvm.org/t/considering-register-pressure-when-deciding-initiation-interval-in-machinepipeliner/74725
---
llvm/include/llvm/CodeGen/MachinePipeliner.h | 18 +-
llvm/lib/CodeGen/MachinePipeliner.cpp | 470 ++++++++++++++++++-
llvm/test/CodeGen/PowerPC/sms-regpress.mir | 330 +++++++++++++
3 files changed, 791 insertions(+), 27 deletions(-)
create mode 100644 llvm/test/CodeGen/PowerPC/sms-regpress.mir
diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 04055ba9732dd44..8f0a17cf99967b3 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -273,8 +273,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
/// Return the new base register that was stored away for the changed
/// instruction.
- unsigned getInstrBaseReg(SUnit *SU) {
- DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+ unsigned getInstrBaseReg(SUnit *SU) const {
+ DenseMap<SUnit *, std::pair<unsigned, int64_t>>::const_iterator It =
InstrChanges.find(SU);
if (It != InstrChanges.end())
return It->second.first;
@@ -639,16 +639,20 @@ class SMSchedule {
computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
TargetInstrInfo::PipelinerLoopInfo *PLI);
+ std::deque<SUnit *>
+ reorderInstructions(const SwingSchedulerDAG *SSD,
+ const std::deque<SUnit *> &Instrs) const;
+
bool
normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
TargetInstrInfo::PipelinerLoopInfo *PLI);
bool isValidSchedule(SwingSchedulerDAG *SSD);
void finalizeSchedule(SwingSchedulerDAG *SSD);
- void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
- std::deque<SUnit *> &Insts);
- bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
- bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
- MachineOperand &MO);
+ void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+ std::deque<SUnit *> &Insts) const;
+ bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const;
+ bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def,
+ MachineOperand &MO) const;
void print(raw_ostream &os) const;
void dump() const;
};
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8cd7f4ebe88d96a..b11ffb5681e1e6e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -35,6 +35,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -60,9 +61,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -174,6 +178,20 @@ static cl::opt<bool> ExperimentalCodeGen(
cl::desc(
"Use the experimental peeling code generator for software pipelining"));
+static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
+ cl::desc("Range to search for II"),
+ cl::Hidden, cl::init(10));
+
+static cl::opt<bool>
+ LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
+ cl::desc("Limit register pressure of scheduled loop"));
+
+static cl::opt<int>
+ RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
+ cl::init(5),
+ cl::desc("Margin representing the unused percentage of "
+ "the register pressure limit"));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -484,7 +502,7 @@ void SwingSchedulerDAG::setMAX_II() {
else if (II_setByPragma > 0)
MAX_II = II_setByPragma;
else
- MAX_II = MII + 10;
+ MAX_II = MII + SwpIISearchRange;
}
/// We override the schedule function in ScheduleDAGInstrs to implement the
@@ -695,7 +713,8 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
}
/// Return the Phi register value that comes the loop block.
-static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+static unsigned getLoopPhiReg(const MachineInstr &Phi,
+ const MachineBasicBlock *LoopBB) {
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() == LoopBB)
return Phi.getOperand(i).getReg();
@@ -1102,6 +1121,359 @@ struct FuncUnitSorter {
}
};
+/// Calculate the maximum register pressure of the scheduled instruction stream
+class HighRegisterPressureDetector {
+ MachineBasicBlock *OrigMBB;
+ const MachineFunction &MF;
+ const MachineRegisterInfo &MRI;
+ const TargetRegisterInfo *TRI;
+
+ const unsigned PSetNum;
+
+ // Indexed by PSet ID
+ // InitSetPressure takes into account the register pressure of live-in
+ // registers. It doesn't depend on how the loop is scheduled, so it's enough
+ // to calculate them once at the beginning.
+ std::vector<unsigned> InitSetPressure;
+
+ // Indexed by PSet ID
+ // Upper limit for each register pressure set
+ std::vector<unsigned> PressureSetLimit;
+
+ using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
+
+public:
+ using OrderedInstsTy = std::vector<MachineInstr *>;
+ using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
+
+private:
+ static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
+ if (Pressures.size() == 0) {
+ dbgs() << "[]";
+ } else {
+ char Prefix = '[';
+ for (unsigned P : Pressures) {
+ dbgs() << Prefix << P;
+ Prefix = ' ';
+ }
+ dbgs() << ']';
+ }
+ }
+
+ void dumpPSet(Register Reg) const {
+ dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
+ for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
+ ++PSetIter) {
+ dbgs() << *PSetIter << ' ';
+ }
+ dbgs() << '\n';
+ }
+
+ void increaseRegisterPressure(std::vector<unsigned> &Pressure,
+ Register Reg) const {
+ auto PSetIter = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetIter.getWeight();
+ for (; PSetIter.isValid(); ++PSetIter)
+ Pressure[*PSetIter] += Weight;
+ }
+
+ void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
+ Register Reg) const {
+ auto PSetIter = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetIter.getWeight();
+ for (; PSetIter.isValid(); ++PSetIter) {
+ auto &P = Pressure[*PSetIter];
+ assert(Weight <= P &&
+ "register pressure must be greater or equal than weight");
+ P -= Weight;
+ }
+ }
+
+ // Return true if Reg is a fixed register, for example, the stack pointer
+ bool isFixedRegister(Register Reg) const {
+ return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
+ }
+
+ bool isDefinedInThisLoop(Register Reg) const {
+ return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
+ }
+
+ // Search for live-in variables. They are factored into the register pressure
+ // from the beginning.
+ void computeLiveIn() {
+ DenseSet<Register> Used;
+ for (const auto &MI : *OrigMBB) {
+ for (const auto &MO : MI.all_uses()) {
+ auto Use = MO.getReg();
+ // Ignore the variable that appears only on one side of phi instruction
+ // because it's used only at the first iteration.
+ if (MI.isPHI() && Use != getLoopPhiReg(MI, OrigMBB))
+ continue;
+ if (isFixedRegister(Use))
+ continue;
+ if (isDefinedInThisLoop(Use))
+ continue;
+ Used.insert(Use);
+ }
+ }
+
+ for (auto LiveIn : Used)
+ increaseRegisterPressure(InitSetPressure, LiveIn);
+ }
+
+ // Calculate the upper limit of each pressure set
+ void computePressureSetLimit(const RegisterClassInfo &RCI) {
+ for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+ PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
+
+ // We assume fixed registers, such as stack pointer, are already in use.
+ // Therefore, subtract the weight of the fixed registers from the limit of
+ // each pressure set in advance.
+ SmallDenseSet<Register, 8> FixedRegs;
+ for (const TargetRegisterClass *TRC : TRI->regclasses()) {
+ for (const MCPhysReg Reg : *TRC)
+ if (isFixedRegister(Reg) && !FixedRegs.contains(Reg))
+ FixedRegs.insert(Reg);
+ }
+
+ LLVM_DEBUG({
+ for (auto Reg : FixedRegs) {
+ dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
+ const int *Sets = TRI->getRegUnitPressureSets(Reg);
+ for (; *Sets != -1; Sets++) {
+ dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+ }
+ dbgs() << "]\n";
+ }
+ });
+
+ for (auto Reg : FixedRegs) {
+ LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
+ << "\n");
+ auto PSetIter = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetIter.getWeight();
+ for (; PSetIter.isValid(); ++PSetIter) {
+ unsigned &Limit = PressureSetLimit[*PSetIter];
+ assert(Weight <= Limit &&
+ "register pressure limit must be greater or equal than weight");
+ Limit -= Weight;
+ LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
+ << " (decreased by " << Weight << ")\n");
+ }
+ }
+ }
+
+ // There are two patterns of last-use.
+ // - by an instruction of the current iteration
+ // - by a phi instruction of the next iteration (loop carried value)
+ //
+ // Furthermore, the following two groups of instructions are executed
+ // simultaneously
+ // - next iteration's phi instructions in i-th stage
+ // - current iteration's instructions in i+1-th stage
+ //
+ // This function calculates the last-use of each register while taking into
+ // account the above two patterns.
+ Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
+ Instr2StageTy &Stages) const {
+ // We treat virtual registers that are defined and used in this loop.
+ // The following virtual registers are ignored:
+ // - live-in ones
+ // - defined but not used in the loop (potentially live-out)
+ DenseSet<Register> TargetRegs;
+ const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
+ if (isDefinedInThisLoop(Reg))
+ TargetRegs.insert(Reg);
+ };
+ for (MachineInstr *MI : OrderedInsts) {
+ if (MI->isPHI()) {
+ Register Reg = getLoopPhiReg(*MI, OrigMBB);
+ UpdateTargetRegs(Reg);
+ } else {
+ for (const auto &MO : MI->all_uses())
+ if (MO.isReg())
+ UpdateTargetRegs(MO.getReg());
+ }
+ }
+
+ const auto InstrScore = [&Stages](MachineInstr *MI) {
+ return Stages[MI] + MI->isPHI();
+ };
+
+ DenseMap<Register, MachineInstr *> LastUseMI;
+ for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
+ for (const auto &MO : MI->all_uses()) {
+ if (!MO.isReg())
+ continue;
+ auto Reg = MO.getReg();
+ if (!TargetRegs.contains(Reg))
+ continue;
+ auto Ite = LastUseMI.find(Reg);
+ if (Ite == LastUseMI.end()) {
+ LastUseMI[Reg] = MI;
+ } else {
+ MachineInstr *Orig = Ite->second;
+ MachineInstr *New = MI;
+ if (InstrScore(Orig) < InstrScore(New))
+ LastUseMI[Reg] = New;
+ }
+ }
+ }
+
+ Instr2LastUsesTy LastUses;
+ for (auto &Entry : LastUseMI)
+ LastUses[Entry.second].insert(Entry.first);
+ return LastUses;
+ }
+
+ // Compute the maximum register pressure of the kernel. We'll simulate #Stage
+ // iterations and check the register pressure at the point where all stages
+ // overlap.
+ //
+ // An example of an unrolled loop where #Stage is 4:
+ // Iter i+0 i+1 i+2 i+3
+ // ------------------------
+ // Stage 0
+ // Stage 1 0
+ // Stage 2 1 0
+ // Stage 3 2 1 0 <- All stages overlap
+ //
+ std::vector<unsigned> exec(const OrderedInstsTy &OrderedInsts,
+ Instr2StageTy &Stages,
+ const unsigned StageCount) const {
+ using RegSetTy = SmallDenseSet<Register, 16>;
+
+ // Indexed by #Iter. To treat "local" variables of each stage separately, we
+ // manage the liveness of the registers independently by iterations.
+ SmallVector<RegSetTy> LiveRegSets(StageCount);
+
+ auto CurSetPressure = InitSetPressure;
+ auto MaxSetPressure = InitSetPressure;
+ auto LastUses = std::move(computeLastUses(OrderedInsts, Stages));
+
+ LLVM_DEBUG({
+ dbgs() << "Ordered instructions:\n";
+ for (MachineInstr *MI : OrderedInsts) {
+ dbgs() << "Stage " << Stages[MI] << ": ";
+ MI->dump();
+ }
+ });
+
+ const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
+ Register Reg) {
+ if (!Reg.isValid() || isFixedRegister(Reg))
+ return;
+
+ bool Inserted = RegSet.insert(Reg).second;
+ if (!Inserted)
+ return;
+
+ LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
+ increaseRegisterPressure(CurSetPressure, Reg);
+ LLVM_DEBUG(dumpPSet(Reg));
+ };
+
+ const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
+ Register Reg) {
+ if (!Reg.isValid() || isFixedRegister(Reg))
+ return;
+
+ // live-in register
+ if (!RegSet.contains(Reg))
+ return;
+
+ LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
+ RegSet.erase(Reg);
+ decreaseRegisterPressure(CurSetPressure, Reg);
+ LLVM_DEBUG(dumpPSet(Reg));
+ };
+
+ for (unsigned I = 0; I < StageCount; I++) {
+ for (MachineInstr *MI : OrderedInsts) {
+ const auto Stage = Stages[MI];
+ if (I < Stage)
+ continue;
+
+ const unsigned Iter = I - Stage;
+
+ for (auto &MO : MI->all_defs())
+ InsertReg(LiveRegSets[Iter], MO.getReg());
+
+ for (auto LastUse : LastUses[MI]) {
+ if (MI->isPHI()) {
+ if (Iter != 0)
+ EraseReg(LiveRegSets[Iter - 1], LastUse);
+ } else {
+ EraseReg(LiveRegSets[Iter], LastUse);
+ }
+ }
+
+ for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+ MaxSetPressure[PSet] =
+ std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
+
+ LLVM_DEBUG({
+ dbgs() << "CurSetPressure=";
+ dumpRegisterPressures(CurSetPressure);
+ dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
+ MI->dump();
+ });
+ }
+ }
+
+ return MaxSetPressure;
+ }
+
+public:
+ HighRegisterPressureDetector() = delete;
+
+ HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
+ const MachineFunction &MF)
+ : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
+ TRI(MF.getSubtarget().getRegisterInfo()),
+ PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
+ PressureSetLimit(PSetNum, 0) {}
+
+ // Used to calculate register pressure, which is independent of loop
+ // scheduling.
+ void init(const RegisterClassInfo &RCI) {
+ computeLiveIn();
+ computePressureSetLimit(RCI);
+ }
+
+ // Calculate the maximum register pressures of the loop and check if they
+ // exceed the limit
+ bool detect(const OrderedInstsTy &OrderedInsts, Instr2StageTy &Stages,
+ const unsigned MaxStage) const {
+ assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
+ "the percentage of the margin must be between 0 to 100");
+ const auto MaxSetPressure =
+ std::move(exec(OrderedInsts, Stages, MaxStage + 1));
+
+ LLVM_DEBUG({
+ dbgs() << "Dump MaxSetPressure:\n";
+ for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
+ dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
+ }
+ dbgs() << '\n';
+ });
+
+ for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
+ unsigned Limit = PressureSetLimit[PSet];
+ unsigned Margin = Limit * RegPressureMargin / 100;
+ LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
+ << " Margin=" << Margin << "\n");
+ if (Limit < MaxSetPressure[PSet] + Margin) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Rejecte the schedule because of too high register pressure\n");
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
} // end anonymous namespace
/// Calculate the resource constrained minimum initiation interval for the
@@ -1957,6 +2329,41 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
});
}
+/// Create an instruction stream that represents a single iteration and stage of
+/// each instruction. This function differs from SMSchedule::finalizeSchedule in
+/// that this doesn't have any side effects on SwingSchedulerDAG. That is, this
+/// function is an approximation of SMSchedule::finalizeSchedule with all
+/// non-const operations removed.
+static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
+ SMSchedule &Schedule,
+ std::vector<MachineInstr *> &OrderedInsts,
+ DenseMap<MachineInstr *, unsigned> &Stages) {
+ DenseMap<int, std::deque<SUnit *>> Instrs;
+
+ // Move all instructions to the first stage from the later stages.
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
+ Stage <= LastStage; ++Stage) {
+ for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
+ Cycle + Stage * Schedule.getInitiationInterval()))) {
+ Instrs[Cycle].push_front(SU);
+ }
+ }
+ }
+
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
+ CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs));
+ for (SUnit *SU : CycleInstrs) {
+ MachineInstr *MI = SU->getInstr();
+ OrderedInsts.push_back(MI);
+ Stages[MI] = Schedule.stageScheduled(SU);
+ }
+ }
+}
+
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
@@ -1967,6 +2374,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
}
bool scheduleFound = false;
+ std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
+ if (LimitRegPressure) {
+ HRPDetector =
+ std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
+ HRPDetector->init(RegClassInfo);
+ }
// Keep increasing II until a valid schedule is found.
for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
Schedule.reset();
@@ -2044,6 +2457,16 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
// If a schedule is found, check if it is a valid schedule too.
if (scheduleFound)
scheduleFound = Schedule.isValidSchedule(this);
+
+ // If a schedule is found and the option is enabled, check if the schedule
+ // might generate additional register spills/fills
+ if (scheduleFound && LimitRegPressure) {
+ std::vector<MachineInstr *> OrderedInsts;
+ DenseMap<MachineInstr *, unsigned> Stages;
+ computeScheduledInsts(this, Schedule, OrderedInsts, Stages);
+ scheduleFound = !HRPDetector->detect(OrderedInsts, Stages,
+ Schedule.getMaxStageCount());
+ }
}
LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
@@ -2483,8 +2906,8 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
-void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
- std::deque<SUnit *> &Insts) {
+void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+ std::deque<SUnit *> &Insts) const {
MachineInstr *MI = SU->getInstr();
bool OrderBeforeUse = false;
bool OrderAfterDef = false;
@@ -2611,7 +3034,8 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
}
/// Return true if the scheduled Phi has a loop carried operand.
-bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
+bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
+ MachineInstr &Phi) const {
if (!Phi.isPHI())
return false;
assert(Phi.isPHI() && "Expecting a Phi.");
@@ -2639,8 +3063,9 @@ bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
/// (MO) = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
-bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
- MachineInstr *Def, MachineOperand &MO) {
+bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
+ MachineInstr *Def,
+ MachineOperand &MO) const {
if (!MO.isReg())
return false;
if (Def->isPHI())
@@ -2895,6 +3320,23 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
}
}
+std::deque<SUnit *>
+SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
+ const std::deque<SUnit *> &Instrs) const {
+ std::deque<SUnit *> NewOrderPhi;
+ for (SUnit *SU : Instrs) {
+ if (SU->getInstr()->isPHI())
+ NewOrderPhi.push_back(SU);
+ }
+ std::deque<SUnit *> NewOrderI;
+ for (SUnit *SU : Instrs) {
+ if (!SU->getInstr()->isPHI())
+ orderDependence(SSD, SU, NewOrderI);
+ }
+ llvm::append_range(NewOrderPhi, NewOrderI);
+ return NewOrderPhi;
+}
+
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
@@ -2924,19 +3366,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
// generated code.
for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
- std::deque<SUnit *> newOrderPhi;
- for (SUnit *SU : cycleInstrs) {
- if (SU->getInstr()->isPHI())
- newOrderPhi.push_back(SU);
- }
- std::deque<SUnit *> newOrderI;
- for (SUnit *SU : cycleInstrs) {
- if (!SU->getInstr()->isPHI())
- orderDependence(SSD, SU, newOrderI);
- }
- // Replace the old order with the new order.
- cycleInstrs.swap(newOrderPhi);
- llvm::append_range(cycleInstrs, newOrderI);
+ cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs));
SSD->fixupRegisterOverlaps(cycleInstrs);
}
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
new file mode 100644
index 000000000000000..e5d0ba137d86993
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -0,0 +1,330 @@
+# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
+# The specific value of II is not important.
+
+# CHECK: Try to schedule with 21
+# CHECK: Can't schedule
+# CHECK: Try to schedule with 22
+# CHECK: Can't schedule
+# CHECK: Try to schedule with 23
+# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Try to schedule with 24
+# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Try to schedule with 25
+# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Try to schedule with 26
+# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Try to schedule with 27
+# CHECK: Schedule Found? 1 (II=27)
+
+--- |
+ ; ModuleID = 'a.ll'
+ source_filename = "a.c"
+ target datalayout = "e-m:e-Fn32-i64:64-n32:64"
+ target triple = "ppc64le"
+
+ ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable
+ define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 {
+ entry:
+ %0 = load double, ptr %a, align 8, !tbaa !3
+ %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1
+ %1 = load double, ptr %arrayidx1, align 8, !tbaa !3
+ %cmp163 = icmp sgt i32 %n, 0
+ br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ %scevgep1 = getelementptr i8, ptr %b, i64 -8
+ call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count)
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ]
+ ret double %res.0.lcssa
+
+ for.body: ; preds = %for.body, %for.body.preheader
+ %res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ]
+ %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ]
+ %3 = getelementptr i8, ptr %2, i64 8
+ %4 = load double, ptr %3, align 8, !tbaa !3
+ %5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0)
+ %6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5)
+ %7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6)
+ %8 = tail call double @llvm.fmuladd.f64(double %7, double %4, double %7)
+ %9 = tail call double @llvm.fmuladd.f64(double %8, double %4, double %8)
+ %10 = tail call double @llvm.fmuladd.f64(double %9, double %4, double %9)
+ %11 = tail call double @llvm.fmuladd.f64(double %10, double %4, double %10)
+ %12 = tail call double @llvm.fmuladd.f64(double %11, double %4, double %11)
+ %13 = tail call double @llvm.fmuladd.f64(double %12, double %4, double %12)
+ %14 = tail call double @llvm.fmuladd.f64(double %13, double %4, double %13)
+ %15 = tail call double @llvm.fmuladd.f64(double %14, double %4, double %14)
+ %16 = tail call double @llvm.fmuladd.f64(double %15, double %4, double %15)
+ %17 = tail call double @llvm.fmuladd.f64(double %16, double %4, double %16)
+ %18 = tail call double @llvm.fmuladd.f64(double %17, double %4, double %17)
+ %19 = tail call double @llvm.fmuladd.f64(double %18, double %4, double %18)
+ %20 = tail call double @llvm.fmuladd.f64(double %19, double %4, double %19)
+ %add = fadd double %19, %20
+ %21 = tail call double @llvm.fmuladd.f64(double %20, double %4, double %add)
+ %add35 = fadd double %12, %21
+ %22 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %add35)
+ %add38 = fadd double %13, %22
+ %23 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %add38)
+ %mul = fmul double %4, %7
+ %mul46 = fmul double %mul, %14
+ %24 = tail call double @llvm.fmuladd.f64(double %mul46, double %13, double %16)
+ %mul50 = fmul double %4, %9
+ %mul51 = fmul double %1, %mul50
+ %25 = tail call double @llvm.fmuladd.f64(double %mul51, double %11, double %24)
+ %add53 = fadd double %5, %25
+ %add54 = fadd double %6, %add53
+ %mul55 = fmul double %14, %16
+ %mul56 = fmul double %mul55, %17
+ %mul57 = fmul double %mul56, %18
+ %26 = tail call double @llvm.fmuladd.f64(double %mul57, double %19, double %add54)
+ %27 = tail call double @llvm.fmuladd.f64(double %10, double %1, double %26)
+ %28 = tail call double @llvm.fmuladd.f64(double %8, double %6, double %27)
+ %mul61 = fmul double %20, %21
+ %mul62 = fmul double %mul61, %22
+ %29 = tail call double @llvm.fmuladd.f64(double %mul62, double %23, double %28)
+ %mul64 = fmul double %26, %29
+ %mul65 = fmul double %24, %mul64
+ %mul66 = fmul double %12, %mul65
+ %30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165)
+ %31 = call i1 @llvm.loop.decrement.i64(i64 1)
+ br i1 %31, label %for.body, label %for.cond.cleanup, !llvm.loop !7
+ }
+
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare double @llvm.fmuladd.f64(double, double, double) #1
+
+ ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+ declare void @llvm.set.loop.iterations.i64(i64) #2
+
+ ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+ declare i1 @llvm.loop.decrement.i64(i64) #2
+
+ attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" }
+ attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.ident = !{!2}
+
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 7, !"uwtable", i32 2}
+ !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"}
+ !3 = !{!4, !4, i64 0}
+ !4 = !{!"double", !5, i64 0}
+ !5 = !{!"omnipotent char", !6, i64 0}
+ !6 = !{!"Simple C/C++ TBAA"}
+ !7 = distinct !{!7, !8, !9}
+ !8 = !{!"llvm.loop.mustprogress"}
+ !9 = !{!"llvm.loop.unroll.disable"}
+
+...
+---
+name: kernel
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vsfrc, preferred-register: '' }
+ - { id: 1, class: vsfrc, preferred-register: '' }
+ - { id: 2, class: g8rc, preferred-register: '' }
+ - { id: 3, class: vsfrc, preferred-register: '' }
+ - { id: 4, class: vsfrc, preferred-register: '' }
+ - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+ - { id: 6, class: g8rc, preferred-register: '' }
+ - { id: 7, class: vsfrc, preferred-register: '' }
+ - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+ - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+ - { id: 10, class: g8rc, preferred-register: '' }
+ - { id: 11, class: gprc, preferred-register: '' }
+ - { id: 12, class: vsfrc, preferred-register: '' }
+ - { id: 13, class: crrc, preferred-register: '' }
+ - { id: 14, class: vsfrc, preferred-register: '' }
+ - { id: 15, class: g8rc, preferred-register: '' }
+ - { id: 16, class: g8rc, preferred-register: '' }
+ - { id: 17, class: g8rc, preferred-register: '' }
+ - { id: 18, class: f8rc, preferred-register: '' }
+ - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+ - { id: 20, class: vsfrc, preferred-register: '' }
+ - { id: 21, class: vsfrc, preferred-register: '' }
+ - { id: 22, class: vsfrc, preferred-register: '' }
+ - { id: 23, class: vsfrc, preferred-register: '' }
+ - { id: 24, class: vsfrc, preferred-register: '' }
+ - { id: 25, class: vsfrc, preferred-register: '' }
+ - { id: 26, class: vsfrc, preferred-register: '' }
+ - { id: 27, class: vsfrc, preferred-register: '' }
+ - { id: 28, class: vsfrc, preferred-register: '' }
+ - { id: 29, class: vsfrc, preferred-register: '' }
+ - { id: 30, class: vsfrc, preferred-register: '' }
+ - { id: 31, class: vsfrc, preferred-register: '' }
+ - { id: 32, class: vsfrc, preferred-register: '' }
+ - { id: 33, class: vsfrc, preferred-register: '' }
+ - { id: 34, class: vsfrc, preferred-register: '' }
+ - { id: 35, class: vsfrc, preferred-register: '' }
+ - { id: 36, class: vsfrc, preferred-register: '' }
+ - { id: 37, class: vsfrc, preferred-register: '' }
+ - { id: 38, class: vsfrc, preferred-register: '' }
+ - { id: 39, class: vsfrc, preferred-register: '' }
+ - { id: 40, class: vsfrc, preferred-register: '' }
+ - { id: 41, class: vsfrc, preferred-register: '' }
+ - { id: 42, class: vsfrc, preferred-register: '' }
+ - { id: 43, class: vsfrc, preferred-register: '' }
+ - { id: 44, class: vsfrc, preferred-register: '' }
+ - { id: 45, class: vsfrc, preferred-register: '' }
+ - { id: 46, class: vsfrc, preferred-register: '' }
+ - { id: 47, class: vsfrc, preferred-register: '' }
+ - { id: 48, class: vsfrc, preferred-register: '' }
+ - { id: 49, class: vsfrc, preferred-register: '' }
+ - { id: 50, class: vsfrc, preferred-register: '' }
+ - { id: 51, class: vsfrc, preferred-register: '' }
+ - { id: 52, class: vsfrc, preferred-register: '' }
+ - { id: 53, class: vsfrc, preferred-register: '' }
+ - { id: 54, class: vsfrc, preferred-register: '' }
+ - { id: 55, class: vsfrc, preferred-register: '' }
+ - { id: 56, class: vsfrc, preferred-register: '' }
+ - { id: 57, class: vsfrc, preferred-register: '' }
+ - { id: 58, class: vsfrc, preferred-register: '' }
+ - { id: 59, class: vsfrc, preferred-register: '' }
+ - { id: 60, class: vsfrc, preferred-register: '' }
+ - { id: 61, class: vsfrc, preferred-register: '' }
+ - { id: 62, class: crbitrc, preferred-register: '' }
+liveins:
+ - { reg: '$x3', virtual-reg: '%8' }
+ - { reg: '$x4', virtual-reg: '%9' }
+ - { reg: '$x5', virtual-reg: '%10' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ successors: %bb.2(0x50000000), %bb.1(0x30000000)
+ liveins: $x3, $x4, $x5
+
+ %10:g8rc = COPY killed $x5
+ %9:g8rc_and_g8rc_nox0 = COPY killed $x4
+ %8:g8rc_and_g8rc_nox0 = COPY killed $x3
+ %11:gprc = COPY killed %10.sub_32
+ %13:crrc = CMPWI %11, 0
+ BCC 44, killed %13, %bb.2
+
+ bb.1:
+ successors: %bb.3(0x80000000)
+
+ %12:vsfrc = XXLXORdpz
+ B %bb.3
+
+ bb.2.for.body.preheader:
+ successors: %bb.4(0x80000000)
+
+ %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3)
+ %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3)
+ %16:g8rc = IMPLICIT_DEF
+ %15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32
+ %17:g8rc = RLDICL killed %15, 0, 32
+ %2:g8rc = ADDI8 killed %9, -8
+ MTCTR8loop killed %17, implicit-def dead $ctr8
+ %14:vsfrc = XXLXORdpz
+ B %bb.4
+
+ bb.3.for.cond.cleanup:
+ %3:vsfrc = PHI %12, %bb.1, %7, %bb.4
+ $f1 = COPY killed %3
+ BLR8 implicit $lr8, implicit $rm, implicit killed $f1
+
+ bb.4.for.body:
+ successors: %bb.4(0x7c000000), %bb.3(0x04000000)
+
+ %4:vsfrc = PHI %14, %bb.2, %7, %bb.4
+ %5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4
+ %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3)
+ %6:g8rc = COPY killed %19
+ %20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm
+ %21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm
+ %22:vsfrc = nofpexcept XSMADDADP %21, %21, %18, implicit $rm
+ %23:vsfrc = nofpexcept XSMADDADP %22, %22, %18, implicit $rm
+ %24:vsfrc = nofpexcept XSMADDADP %23, %23, %18, implicit $rm
+ %25:vsfrc = nofpexcept XSMADDADP %24, %24, %18, implicit $rm
+ %26:vsfrc = nofpexcept XSMADDADP %25, %25, %18, implicit $rm
+ %27:vsfrc = nofpexcept XSMADDADP %26, %26, %18, implicit $rm
+ %28:vsfrc = nofpexcept XSMADDADP %27, %27, %18, implicit $rm
+ %29:vsfrc = nofpexcept XSMADDADP %28, %28, %18, implicit $rm
+ %30:vsfrc = nofpexcept XSMADDADP %29, %29, %18, implicit $rm
+ %31:vsfrc = nofpexcept XSMADDADP killed %30, %30, %18, implicit $rm
+ %32:vsfrc = nofpexcept XSMADDADP %31, %31, %18, implicit $rm
+ %33:vsfrc = nofpexcept XSMADDADP %32, %32, %18, implicit $rm
+ %34:vsfrc = nofpexcept XSMADDADP %33, %33, %18, implicit $rm
+ %35:vsfrc = nofpexcept XSMADDADP %34, %34, %18, implicit $rm
+ %36:vsfrc = nofpexcept XSADDDP %34, %35, implicit $rm
+ %37:vsfrc = nofpexcept XSMADDADP killed %36, %35, %18, implicit $rm
+ %38:vsfrc = nofpexcept XSADDDP %27, %37, implicit $rm
+ %39:vsfrc = nofpexcept XSMADDADP killed %38, %20, %18, implicit $rm
+ %40:vsfrc = nofpexcept XSADDDP %28, %39, implicit $rm
+ %41:vsfrc = nofpexcept XSMADDADP killed %40, %21, %18, implicit $rm
+ %42:vsfrc = nofpexcept XSMULDP %18, killed %22, implicit $rm
+ %43:vsfrc = nofpexcept XSMULDP killed %42, %29, implicit $rm
+ %44:vsfrc = nofpexcept XSMADDADP %31, killed %43, killed %28, implicit $rm
+ %45:vsfrc = nofpexcept XSMULDP killed %18, killed %24, implicit $rm
+ %46:vsfrc = nofpexcept XSMULDP %1, killed %45, implicit $rm
+ %47:vsfrc = nofpexcept XSMADDADP %44, killed %46, killed %26, implicit $rm
+ %48:vsfrc = nofpexcept XSADDDP killed %20, killed %47, implicit $rm
+ %49:vsfrc = nofpexcept XSADDDP %21, killed %48, implicit $rm
+ %50:vsfrc = nofpexcept XSMULDP killed %29, killed %31, implicit $rm
+ %51:vsfrc = nofpexcept XSMULDP killed %50, killed %32, implicit $rm
+ %52:vsfrc = nofpexcept XSMULDP killed %51, killed %33, implicit $rm
+ %53:vsfrc = nofpexcept XSMADDADP killed %49, killed %52, killed %34, implicit $rm
+ %54:vsfrc = nofpexcept XSMADDADP %53, %25, %1, implicit $rm
+ %55:vsfrc = nofpexcept XSMADDADP killed %54, killed %23, killed %21, implicit $rm
+ %56:vsfrc = nofpexcept XSMULDP killed %35, killed %37, implicit $rm
+ %57:vsfrc = nofpexcept XSMULDP killed %56, killed %39, implicit $rm
+ %58:vsfrc = nofpexcept XSMADDADP killed %55, killed %57, killed %41, implicit $rm
+ %59:vsfrc = nofpexcept XSMULDP killed %53, killed %58, implicit $rm
+ %60:vsfrc = nofpexcept XSMULDP killed %44, killed %59, implicit $rm
+ %61:vsfrc = nofpexcept XSMULDP killed %27, killed %60, implicit $rm
+ %7:vsfrc = nofpexcept XSMADDADP killed %4, killed %61, killed %25, implicit $rm
+ BDNZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
+ B %bb.3
+
+...
>From 1d2b6438a6f29b0665de0aba23f52f7c06affc33 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 4 Jan 2024 19:32:39 +0900
Subject: [PATCH 2/6] Apply suggestions from code review
Co-authored-by: Leandro Lupori <leandro.lupori at gmail.com>
---
llvm/lib/CodeGen/MachinePipeliner.cpp | 32 +++++++++++++--------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b11ffb5681e1e6e..0f154bc859b7ddc 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1131,9 +1131,9 @@ class HighRegisterPressureDetector {
const unsigned PSetNum;
// Indexed by PSet ID
- // InitSetPressure takes into account the register preesure of live-in
+ // InitSetPressure takes into account the register pressure of live-in
// registers. It's not depend on how the loop is scheduled, so it's enough to
- // calculate them once at the begining.
+ // calculate them once at the beginning.
std::vector<unsigned> InitSetPressure;
// Indexed by PSet ID
@@ -1183,13 +1183,13 @@ class HighRegisterPressureDetector {
unsigned Weight = PSetIter.getWeight();
for (; PSetIter.isValid(); ++PSetIter) {
auto &P = Pressure[*PSetIter];
- assert(Weight <= P &&
- "register pressure must be greater or equal than weight");
+ assert(P >= Weight &&
+ "register pressure must be greater than or equal weight");
P -= Weight;
}
}
- // Retrun true if Reg is fixed one, for example, stack pointer
+ // Return true if Reg is fixed one, for example, stack pointer
bool isFixedRegister(Register Reg) const {
return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
}
@@ -1221,7 +1221,7 @@ class HighRegisterPressureDetector {
increaseRegisterPressure(InitSetPressure, LiveIn);
}
- // Calcluate the upper limit of each pressure set
+ // Calculate the upper limit of each pressure set
void computePressureSetLimit(const RegisterClassInfo &RCI) {
for (unsigned PSet = 0; PSet < PSetNum; PSet++)
PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
@@ -1254,8 +1254,8 @@ class HighRegisterPressureDetector {
unsigned Weight = PSetIter.getWeight();
for (; PSetIter.isValid(); ++PSetIter) {
unsigned &Limit = PressureSetLimit[*PSetIter];
- assert(Weight <= Limit &&
- "register pressure limit must be greater or equal than weight");
+ assert(Limit >= Weight &&
+ "register pressure limit must be greater than or equal weight");
Limit -= Weight;
LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
<< " (decreased by " << Weight << ")\n");
@@ -1267,7 +1267,7 @@ class HighRegisterPressureDetector {
// - by an instruction of the current iteration
// - by a phi instruction of the next iteration (loop carried value)
//
- // Furthermore, following two gropus of instructions are executed
+ // Furthermore, following two groups of instructions are executed
// simultaneously
// - next iteration's phi instructions in i-th stage
// - current iteration's instructions in i+1-th stage
@@ -1442,7 +1442,7 @@ class HighRegisterPressureDetector {
}
// Calculate the maximum register pressures of the loop and check if they
- // excced the limit
+ // exceed the limit
bool detect(const OrderedInstsTy &OrderedInsts, Instr2StageTy &Stages,
const unsigned MaxStage) const {
assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
@@ -1466,7 +1466,7 @@ class HighRegisterPressureDetector {
if (Limit < MaxSetPressure[PSet] + Margin) {
LLVM_DEBUG(
dbgs()
- << "Rejecte the schedule because of too high register pressure\n");
+ << "Rejected the schedule because of too high register pressure\n");
return true;
}
}
@@ -2329,11 +2329,11 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
});
}
-/// Create a instruction stream that represents a single iteration and stage of
+/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
-/// function is approximation of SMSchedule::finalizeSchedule with all non-const
-/// operations removed
+/// function is an approximation of SMSchedule::finalizeSchedule with all non-const
+/// operations removed.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
SMSchedule &Schedule,
std::vector<MachineInstr *> &OrderedInsts,
@@ -2458,8 +2458,8 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
if (scheduleFound)
scheduleFound = Schedule.isValidSchedule(this);
- // If a schedule found and the option is enabled, check if the schedule
- // might generate additional register spill/fill
+ // If a schedule was found and the option is enabled, check if the schedule
+ // might generate additional register spills/fills.
if (scheduleFound && LimitRegPressure) {
std::vector<MachineInstr *> OrderedInsts;
DenseMap<MachineInstr *, unsigned> Stages;
>From 3e08340eeea6ff9de936369e197850eecbbd3db6 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 4 Jan 2024 20:42:44 +0900
Subject: [PATCH 3/6] fixup! [CodeGen][MachinePipeliner] Limit register
pressure when scheduling
---
llvm/lib/CodeGen/MachinePipeliner.cpp | 107 ++++++++++++++------------
1 file changed, 56 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 0f154bc859b7ddc..acf471dc6d6edea 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1015,6 +1015,41 @@ void SwingSchedulerDAG::changeDependences() {
}
}
+/// Create an instruction stream that represents a single iteration and stage of
+/// each instruction. This function differs from SMSchedule::finalizeSchedule in
+/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
+/// function is an approximation of SMSchedule::finalizeSchedule with all
+/// non-const operations removed.
+static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
+ SMSchedule &Schedule,
+ std::vector<MachineInstr *> &OrderedInsts,
+ DenseMap<MachineInstr *, unsigned> &Stages) {
+ DenseMap<int, std::deque<SUnit *>> Instrs;
+
+ // Move all instructions to the first stage from the later stages.
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
+ Stage <= LastStage; ++Stage) {
+ for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
+ Cycle + Stage * Schedule.getInitiationInterval()))) {
+ Instrs[Cycle].push_front(SU);
+ }
+ }
+ }
+
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
+ CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs));
+ for (SUnit *SU : CycleInstrs) {
+ MachineInstr *MI = SU->getInstr();
+ OrderedInsts.push_back(MI);
+ Stages[MI] = Schedule.stageScheduled(SU);
+ }
+ }
+}
+
namespace {
// FuncUnitSorter - Comparison operator used to sort instructions by
@@ -1199,7 +1234,13 @@ class HighRegisterPressureDetector {
}
// Search for live-in variables. They are factored into the register pressure
- // from the begining.
+ // from the beginning. Live-in variables used by every iteration should be
+ // considered as alive throughout the loop. For example, the variable `c` in
+ // the following code. \code
+ // int c = ...;
+ // for (int i = 0; i < n; i++)
+ // a[i] += b[i] + c;
+ // \endcode
void computeLiveIn() {
DenseSet<Register> Used;
for (const auto &MI : *OrigMBB) {
@@ -1232,7 +1273,7 @@ class HighRegisterPressureDetector {
SmallDenseSet<Register, 8> FixedRegs;
for (const TargetRegisterClass *TRC : TRI->regclasses()) {
for (const MCPhysReg Reg : *TRC)
- if (isFixedRegister(Reg) && !FixedRegs.contains(Reg))
+ if (isFixedRegister(Reg))
FixedRegs.insert(Reg);
}
@@ -1338,9 +1379,10 @@ class HighRegisterPressureDetector {
// Stage 2 1 0
// Stage 3 2 1 0 <- All stages overlap
//
- std::vector<unsigned> exec(const OrderedInstsTy &OrderedInsts,
- Instr2StageTy &Stages,
- const unsigned StageCount) const {
+ std::vector<unsigned>
+ computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
+ Instr2StageTy &Stages,
+ const unsigned StageCount) const {
using RegSetTy = SmallDenseSet<Register, 16>;
// Indexed by #Iter. To treat "local" variables of each stage separately, we
@@ -1425,8 +1467,6 @@ class HighRegisterPressureDetector {
}
public:
- HighRegisterPressureDetector() = delete;
-
HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
const MachineFunction &MF)
: OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
@@ -1443,12 +1483,16 @@ class HighRegisterPressureDetector {
// Calculate the maximum register pressures of the loop and check if they
// exceed the limit
- bool detect(const OrderedInstsTy &OrderedInsts, Instr2StageTy &Stages,
+ bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
const unsigned MaxStage) const {
assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
"the percentage of the margin must be between 0 to 100");
+
+ OrderedInstsTy OrderedInsts;
+ Instr2StageTy Stages;
+ computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
const auto MaxSetPressure =
- std::move(exec(OrderedInsts, Stages, MaxStage + 1));
+ std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1));
LLVM_DEBUG({
dbgs() << "Dump MaxSetPressure:\n";
@@ -2329,41 +2373,6 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
});
}
-/// Create an instruction stream that represents a single iteration and stage of
-/// each instruction. This function differs from SMSchedule::finalizeSchedule in
-/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
-/// function is an approximation of SMSchedule::finalizeSchedule with all non-const
-/// operations removed.
-static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
- SMSchedule &Schedule,
- std::vector<MachineInstr *> &OrderedInsts,
- DenseMap<MachineInstr *, unsigned> &Stages) {
- DenseMap<int, std::deque<SUnit *>> Instrs;
-
- // Move all instructions to the first stage from the later stages.
- for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
- ++Cycle) {
- for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
- Stage <= LastStage; ++Stage) {
- for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
- Cycle + Stage * Schedule.getInitiationInterval()))) {
- Instrs[Cycle].push_front(SU);
- }
- }
- }
-
- for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
- ++Cycle) {
- std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
- CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs));
- for (SUnit *SU : CycleInstrs) {
- MachineInstr *MI = SU->getInstr();
- OrderedInsts.push_back(MI);
- Stages[MI] = Schedule.stageScheduled(SU);
- }
- }
-}
-
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
@@ -2460,13 +2469,9 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
// If a schedule was found and the option is enabled, check if the schedule
// might generate additional register spills/fills.
- if (scheduleFound && LimitRegPressure) {
- std::vector<MachineInstr *> OrderedInsts;
- DenseMap<MachineInstr *, unsigned> Stages;
- computeScheduledInsts(this, Schedule, OrderedInsts, Stages);
- scheduleFound = !HRPDetector->detect(OrderedInsts, Stages,
- Schedule.getMaxStageCount());
- }
+ if (scheduleFound && LimitRegPressure)
+ scheduleFound =
+ !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
}
LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
>From 082b3fe760edf7254caa831325ff68406673f73a Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 9 Jan 2024 13:36:00 +0900
Subject: [PATCH 4/6] fixup! [CodeGen][MachinePipeliner] Limit register
pressure when scheduling
---
llvm/test/CodeGen/PowerPC/sms-regpress.mir | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
index e5d0ba137d86993..d61ead2fc089a7b 100644
--- a/llvm/test/CodeGen/PowerPC/sms-regpress.mir
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -8,13 +8,13 @@
# CHECK: Try to schedule with 22
# CHECK: Can't schedule
# CHECK: Try to schedule with 23
-# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Rejected the schedule because of too high register pressure
# CHECK: Try to schedule with 24
-# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Rejected the schedule because of too high register pressure
# CHECK: Try to schedule with 25
-# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Rejected the schedule because of too high register pressure
# CHECK: Try to schedule with 26
-# CHECK: Rejecte the schedule because of too high register pressure
+# CHECK: Rejected the schedule because of too high register pressure
# CHECK: Try to schedule with 27
# CHECK: Schedule Found? 1 (II=27)
>From abc4fa2f55304358ff09274e939afecb6e3e3355 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Mon, 15 Jan 2024 15:38:20 +0900
Subject: [PATCH 5/6] fixup! [CodeGen][MachinePipeliner] Limit register
pressure when scheduling
---
llvm/lib/CodeGen/MachinePipeliner.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index acf471dc6d6edea..3a5656a41e3f21d 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1244,6 +1244,8 @@ class HighRegisterPressureDetector {
void computeLiveIn() {
DenseSet<Register> Used;
for (const auto &MI : *OrigMBB) {
+ if (MI.isDebugInstr())
+ continue;
for (const auto &MO : MI.all_uses()) {
auto Use = MO.getReg();
// Ignore the variable that appears only on one side of phi instruction
>From 31b4b505e023cc44a855ed3f726da348f95aa5b3 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Wed, 17 Jan 2024 19:33:29 +0900
Subject: [PATCH 6/6] fixup! [CodeGen][MachinePipeliner] Limit register
pressure when scheduling
---
llvm/lib/CodeGen/MachinePipeliner.cpp | 37 ++++++++++++----------
llvm/test/CodeGen/PowerPC/sms-regpress.mir | 4 +--
2 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3a5656a41e3f21d..5c9f0f1703a6e4a 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1175,6 +1175,8 @@ class HighRegisterPressureDetector {
// Upper limit for each register pressure set
std::vector<unsigned> PressureSetLimit;
+ DenseMap<MachineInstr *, RegisterOperands> ROMap;
+
using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
public:
@@ -1243,20 +1245,20 @@ class HighRegisterPressureDetector {
// \endcode
void computeLiveIn() {
DenseSet<Register> Used;
- for (const auto &MI : *OrigMBB) {
+ for (auto &MI : *OrigMBB) {
if (MI.isDebugInstr())
continue;
- for (const auto &MO : MI.all_uses()) {
- auto Use = MO.getReg();
+ for (auto Use : ROMap[&MI].Uses) {
+ auto Reg = Use.RegUnit;
// Ignore the variable that appears only on one side of phi instruction
// because it's used only at the first iteration.
- if (MI.isPHI() && Use != getLoopPhiReg(MI, OrigMBB))
+ if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
continue;
- if (isFixedRegister(Use))
+ if (isFixedRegister(Reg))
continue;
- if (isDefinedInThisLoop(Use))
+ if (isDefinedInThisLoop(Reg))
continue;
- Used.insert(Use);
+ Used.insert(Reg);
}
}
@@ -1333,9 +1335,8 @@ class HighRegisterPressureDetector {
Register Reg = getLoopPhiReg(*MI, OrigMBB);
UpdateTargetRegs(Reg);
} else {
- for (const auto &MO : MI->all_uses())
- if (MO.isReg())
- UpdateTargetRegs(MO.getReg());
+ for (auto Use : ROMap.find(MI)->getSecond().Uses)
+ UpdateTargetRegs(Use.RegUnit);
}
}
@@ -1345,10 +1346,8 @@ class HighRegisterPressureDetector {
DenseMap<Register, MachineInstr *> LastUseMI;
for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
- for (const auto &MO : MI->all_uses()) {
- if (!MO.isReg())
- continue;
- auto Reg = MO.getReg();
+ for (auto Use : ROMap.find(MI)->getSecond().Uses) {
+ auto Reg = Use.RegUnit;
if (!TargetRegs.contains(Reg))
continue;
auto Ite = LastUseMI.find(Reg);
@@ -1440,8 +1439,8 @@ class HighRegisterPressureDetector {
const unsigned Iter = I - Stage;
- for (auto &MO : MI->all_defs())
- InsertReg(LiveRegSets[Iter], MO.getReg());
+ for (auto Def : ROMap.find(MI)->getSecond().Defs)
+ InsertReg(LiveRegSets[Iter], Def.RegUnit);
for (auto LastUse : LastUses[MI]) {
if (MI->isPHI()) {
@@ -1479,6 +1478,12 @@ class HighRegisterPressureDetector {
// Used to calculate register pressure, which is independent of loop
// scheduling.
void init(const RegisterClassInfo &RCI) {
+ for (MachineInstr &MI : *OrigMBB) {
+ if (MI.isDebugInstr())
+ continue;
+ ROMap[&MI].collect(MI, *TRI, MRI, false, true);
+ }
+
computeLiveIn();
computePressureSetLimit(RCI);
}
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
index d61ead2fc089a7b..f523b4548eecc91 100644
--- a/llvm/test/CodeGen/PowerPC/sms-regpress.mir
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -14,9 +14,7 @@
# CHECK: Try to schedule with 25
# CHECK: Rejected the schedule because of too high register pressure
# CHECK: Try to schedule with 26
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 27
-# CHECK: Schedule Found? 1 (II=27)
+# CHECK: Schedule Found? 1 (II=26)
--- |
; ModuleID = 'a.ll'
More information about the llvm-commits
mailing list