[llvm] [MachineScheduler] Experimental option to partially disable pre-ra scheduling. (PR #90181)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 13 23:27:44 PDT 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/90181
>From 5224a6b035033e3f47e0b4ce85eaa13ac7968c37 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 24 Apr 2024 17:21:42 +0200
Subject: [PATCH 1/2] -nosched-above
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 17 +++++++++++++++++
llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 5 +++++
llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 10 ++++++++++
llvm/lib/CodeGen/MachineScheduler.cpp | 10 +++++++++-
llvm/lib/Target/SystemZ/SystemZSubtarget.cpp | 16 ++++++++++++++++
llvm/lib/Target/SystemZ/SystemZSubtarget.h | 6 ++++++
6 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index b15abf040058e..fb22f7de0a562 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -241,6 +241,12 @@ class MachineSchedStrategy {
/// Tell the strategy that MBB is about to be processed.
virtual void enterMBB(MachineBasicBlock *MBB) {};
+ virtual bool disableForRegionPreRA(MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) const {
+ return false;
+ }
+
/// Tell the strategy that current MBB is done.
virtual void leaveMBB() {};
@@ -487,6 +493,13 @@ class ScheduleDAGMILive : public ScheduleDAGMI {
MachineBasicBlock::iterator end,
unsigned regioninstrs) override;
+ bool disableForRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) const override {
+ return SchedImpl->disableForRegionPreRA(begin, end, regioninstrs);
+ }
+
/// Implement ScheduleDAGInstrs interface for scheduling a sequence of
/// reorderable instructions.
void schedule() override;
@@ -1219,6 +1232,10 @@ class GenericScheduler : public GenericSchedulerBase {
void dumpPolicy() const override;
+ bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) const override;
+
bool shouldTrackPressure() const override {
return RegionPolicy.ShouldTrackPressure;
}
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 32ff15fc75936..6594048b8f8a2 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -319,6 +319,11 @@ namespace llvm {
MachineBasicBlock::iterator end,
unsigned regioninstrs);
+ virtual bool disableForRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) const { return false; }
+
/// Called when the scheduler has finished scheduling the current region.
virtual void exitRegion();
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 5023e29ce145a..9d74a2ec57804 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/PBQPRAConstraint.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
@@ -229,6 +230,15 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {}
+ /// Allow the subtarget to leave a region untouched. This has purposefully
+ /// been kept separate from the other policy hooks, as it is hopefully
+ /// just a temporary solution.
+ virtual bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) const {
+ return false;
+ }
+
// Perform target-specific adjustments to the latency of a schedule
// dependency.
// If a pair of operands is associated with the schedule dependency, DefOpIdx
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index cf72f74380835..38a8a1de70c2e 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -641,7 +641,8 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (I == RegionEnd || I == std::prev(RegionEnd)) {
+ if (I == RegionEnd || I == std::prev(RegionEnd) ||
+ Scheduler.disableForRegion(&*MBB, I, RegionEnd, NumRegionInstrs)) {
// Close the current region. Bundle the terminator if needed.
// This invalidates 'RegionEnd' and 'I'.
Scheduler.exitRegion();
@@ -3336,6 +3337,13 @@ void GenericScheduler::dumpPolicy() const {
#endif
}
+bool GenericScheduler::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) const {
+ const MachineFunction &MF = *Begin->getMF();
+ return MF.getSubtarget().disableForRegionPreRA(Begin, End, NumRegionInstrs);
+}
+
/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
/// critical path by more cycles than it takes to drain the instruction buffer.
/// We estimate an upper bounds on in-flight instructions as:
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index d0badd3692e40..e09ca747e4fe2 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -72,6 +72,22 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
TLInfo(TM, *this), FrameLowering(SystemZFrameLowering::create(*this)) {}
+
+// EXPERIMENTAL
+cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
+bool SystemZSubtarget::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) const {
+ // It seems that the generic scheduler can currently increase spilling heavily
+ // with big / huge regions. Disable it for such regions until this is fixed.
+ if (NumRegionInstrs > NoSchedAbove) {
+ LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
+ << NumRegionInstrs << " instructions\n";);
+ return true;
+ }
+ return false;
+}
+
bool SystemZSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 5fa7c8f194ebf..c5749405cc71a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -89,6 +89,12 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
// "source" order scheduler.
bool enableMachineScheduler() const override { return true; }
+ // Don't use pre-ra mischeduler for huge regions where it creates a lot of
+ // spilling (temporary solution).
+ bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) const override;
+
// This is important for reducing register pressure in vector code.
bool useAA() const override { return true; }
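For reference, a cl::opt registered this way is exposed as a plain llc flag, so the cutoff can be exercised directly. A minimal sketch of an invocation (input file name and threshold invented; in an assertions build, -debug-only=machine-scheduler shows the skipped regions once the check moves into MachineScheduler.cpp in PATCH 2):

  llc -mtriple=s390x-linux-gnu -nosched-above=350 big-region.ll -o /dev/null

Any region with more than 350 instructions is then left in input order by the pre-ra scheduler.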
>From bae21d51a33c66672e3a67dc919a6b23fa266b09 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 26 Apr 2024 08:53:31 +0200
Subject: [PATCH 2/2] Some heuristics for OOO scheduling
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 44 +-
llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 4 +-
.../llvm/CodeGen/TargetSubtargetInfo.h | 10 -
llvm/lib/CodeGen/MachineScheduler.cpp | 411 +++++++++++++++++-
llvm/lib/Target/SystemZ/SystemZSubtarget.cpp | 16 -
llvm/lib/Target/SystemZ/SystemZSubtarget.h | 6 -
6 files changed, 434 insertions(+), 57 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index fb22f7de0a562..6a0c886cbd78e 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -94,6 +94,7 @@
#include <cassert>
#include <llvm/Support/raw_ostream.h>
#include <memory>
+#include <set>
#include <string>
#include <vector>
@@ -241,12 +242,6 @@ class MachineSchedStrategy {
/// Tell the strategy that MBB is about to be processed.
virtual void enterMBB(MachineBasicBlock *MBB) {};
- virtual bool disableForRegionPreRA(MachineBasicBlock::iterator begin,
- MachineBasicBlock::iterator end,
- unsigned regioninstrs) const {
- return false;
- }
-
/// Tell the strategy that current MBB is done.
virtual void leaveMBB() {};
@@ -496,9 +491,7 @@ class ScheduleDAGMILive : public ScheduleDAGMI {
bool disableForRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned regioninstrs) const override {
- return SchedImpl->disableForRegionPreRA(begin, end, regioninstrs);
- }
+ unsigned regioninstrs) const override;
/// Implement ScheduleDAGInstrs interface for scheduling a sequence of
/// reorderable instructions.
@@ -1084,7 +1077,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
enum CandReason : uint8_t {
NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak,
RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
- TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
+ TopDepthReduce, TopPathReduce, NextDefUse, RegPressure, NodeOrder};
#ifndef NDEBUG
static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
@@ -1221,6 +1214,33 @@ int biasPhysReg(const SUnit *SU, bool isTop);
/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class GenericScheduler : public GenericSchedulerBase {
+ //// Experimental members for OOO scheduling. ////
+
+ // TODO: Integrate with SchedDFSResult class.
+ // SU -> Nodes above in subtree.
+ std::vector<std::set<const SUnit *> > TreeSUs;
+ // SU -> Virtual regs defined above in subtree.
+ std::vector<std::set<Register> > TreeDefs;
+ // SU -> Regs used but not defined above in subtree.
+ std::vector<std::set<Register> > TreeUses;
+
+ // If this SU is non-null, it is the start of a subtree to be scheduled as
+ // a unit.
+ mutable SUnit *NextSubtreeSU = nullptr;
+ // A (small) set of instructions to be scheduled next as a unit.
+ std::set<const SUnit *> NextQueue;
+
+ unsigned DAGHeight;
+ unsigned DAGDepth;
+ unsigned NumScheduled;
+ std::set<Register> LiveRegs; // Currently live registers.
+
+ void initLiveRegs(ScheduleDAGMILive *DAG);
+ void getMIPDiff(const MachineInstr *MI, PressureDiff &PDiff) const;
+ void getTreePDiff(unsigned NodeNum, PressureDiff &PDiff) const;
+ int comparePDiffs(PressureDiff &PDiff1, PressureDiff &PDiff2) const;
+ //// ////
+
public:
GenericScheduler(const MachineSchedContext *C):
GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
@@ -1232,10 +1252,6 @@ class GenericScheduler : public GenericSchedulerBase {
void dumpPolicy() const override;
- bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) const override;
-
bool shouldTrackPressure() const override {
return RegionPolicy.ShouldTrackPressure;
}
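To make the new bookkeeping concrete, here is a hypothetical picture of what the three per-node sets would hold for a small three-node DFS subtree (node and vreg numbers invented; see the propagation loop added to initialize() in MachineScheduler.cpp below):

  // Hypothetical chain forming one DFS subtree:
  //   SU(3): %v1 = load ...
  //   SU(5): %v2 = add %v1, %v0     // %v0 defined outside the subtree
  //   SU(7): %v3 = mul %v2, %v2
  // After initialize(), indexed by NodeNum:
  //   TreeSUs[7]  = { SU(3), SU(5), SU(7) }  // nodes above in the subtree
  //   TreeDefs[7] = { %v1, %v2, %v3 }        // vregs defined in the subtree
  //   TreeUses[7] = { %v0 }                  // used but not defined in it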
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 6594048b8f8a2..789948acedcab 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -322,7 +322,9 @@ namespace llvm {
virtual bool disableForRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned regioninstrs) const { return false; }
+ unsigned regioninstrs) const {
+ return false;
+ }
/// Called when the scheduler has finished scheduling the current region.
virtual void exitRegion();
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 9d74a2ec57804..5023e29ce145a 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -16,7 +16,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/PBQPRAConstraint.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
@@ -230,15 +229,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {}
- /// Allow the subtarget to leave a region untouched. This has purposefully
- /// been left a bit untangled from other methods as this is hopefully
- /// just a temporary solution.
- virtual bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) const {
- return false;
- }
-
// Perform target-specific adjustments to the latency of a schedule
// dependency.
// If a pair of operands is associated with the schedule dependency, DefOpIdx
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 38a8a1de70c2e..f071dc40c3724 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1253,6 +1253,23 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
"ShouldTrackLaneMasks requires ShouldTrackPressure");
}
+// EXPERIMENTAL: It seems that GenericScheduler currently often increases
+// spilling heavily with huge regions (like >350 instructions). With this
+// option, any sched region bigger than its value has pre-ra scheduling
+// skipped.
+cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
+bool ScheduleDAGMILive::disableForRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) const {
+ if (regioninstrs > NoSchedAbove) {
+ LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
+ << regioninstrs << " instructions\n";);
+ return true;
+ }
+ return false;
+}
+
// Setup the register pressure trackers for the top scheduled and bottom
// scheduled regions.
void ScheduleDAGMILive::initRegPressure() {
@@ -2455,7 +2472,53 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
/// can dispatch per cycle.
///
/// TODO: Also check whether the SU must start a new group.
+
+// EXPERIMENTAL: General switch for experimental OOO heuristics.
+// Note: This will (currently) enforce bottom-up only scheduling for affected
+// regions.
+cl::opt<bool> OOOSCHED("misched-ooo", cl::init(false));
+
+// EXPERIMENTAL: Similar to NoSchedAbove. Disables the OOO heuristics for
+// regions smaller than this limit.
+cl::opt<unsigned> NoOOOSchedBelow("no-ooosched-below", cl::init(0));
+
+// EXPERIMENTAL: Disable cycle hazards for OOO target.
+cl::opt<bool> NOHAZARDS("misched-nohazards", cl::init(true));
+
+// EXPERIMENTAL: Used for verification purposes (disabling rescheduling).
+cl::opt<bool> INPUTORDER("misched-inputorder", cl::init(false));
+
+// EXPERIMENTAL: Enable regpressure heuristics for OOO scheduling.
+cl::opt<bool> REGPRESS("misched-regpress", cl::init(true));
+
+// EXPERIMENTAL: Use DFSResult for regpressure heuristics with OOO scheduling.
+cl::opt<bool> DFS("misched-dfs", cl::init(true));
+
+// EXPERIMENTAL: The size limit of subtrees to schedule as a unit.
+cl::opt<unsigned> DFSSIZE("dfs-size", cl::init(4));
+
+// EXPERIMENTAL: Enable height heuristic for OOO scheduling.
+cl::opt<bool> HEIGHTHEUR("misched-heightheur", cl::init(true));
+
+// EXPERIMENTAL: Rough differentiator for the height heuristic: if the ratio
+// DAG size / height is greater than or equal to this value, the DAG is
+// considered too "wide" to enable the height heuristic.
+cl::opt<unsigned> HEIGHTIFWFAC("misched-heightifwfac", cl::init(3));
+
+static bool doOOOSchedForRegion(unsigned NumRegionInstrs) {
+ return OOOSCHED && NumRegionInstrs > NoOOOSchedBelow;
+}
+
+static bool doHeightHeurForRegion(const ScheduleDAGMI *DAG, unsigned DAGHeight) {
+ return HEIGHTHEUR &&
+ DAGHeight != 0 && (DAG->SUnits.size() / DAGHeight) < HEIGHTIFWFAC;
+}
+
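Taken together, the options above form the experimental command-line surface. A hypothetical invocation (values invented; the boolean options default as initialized above):

  llc -mtriple=s390x-linux-gnu -misched-ooo -no-ooosched-below=20 \
      -misched-dfs -dfs-size=4 -misched-heightheur -misched-heightifwfac=3 \
      huge.ll -o /dev/null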
bool SchedBoundary::checkHazard(SUnit *SU) {
+ // Better to make SU available and potentially reduce register pressure.
+ if ((doOOOSchedForRegion(DAG->SUnits.size()) && NOHAZARDS) || INPUTORDER)
+ return false;
+
if (HazardRec->isEnabled()
&& HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
return true;
@@ -2692,9 +2755,9 @@ void SchedBoundary::bumpNode(SUnit *SU) {
// exceed the issue width.
const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
- assert(
- (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
- "Cannot schedule this instruction's MicroOps in the current cycle.");
+ assert((doOOOSchedForRegion(DAG->SUnits.size()) || INPUTORDER ||
+ (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth())) &&
+ "Cannot schedule this instruction's MicroOps in the current cycle.");
unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
@@ -3109,6 +3172,7 @@ const char *GenericSchedulerBase::getReasonStr(
case BotHeightReduce:return "BOT-HEIGHT";
case BotPathReduce: return "BOT-PATH ";
case NextDefUse: return "DEF-USE ";
+ case RegPressure: return "REG-PRESS ";
case NodeOrder: return "ORDER ";
};
llvm_unreachable("Unknown reason!");
@@ -3246,6 +3310,20 @@ static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
tracePick(Cand.Reason, Cand.AtTop);
}
+// Skip all operands that are not interesting registers from a scheduling
+// register pressure perspective.
+bool skipOp(const MachineOperand &Op, const MachineRegisterInfo *MRI,
+ bool SkipPhysRegs = true) {
+ if (!Op.isReg() || !Op.getReg() || Op.isImplicit() ||
+ (Op.isUse() && Op.isUndef()) || Op.isDead())
+ return true;
+
+ if (Register::isPhysicalRegister(Op.getReg()))
+ return SkipPhysRegs ? true : !MRI->isAllocatable(Op.getReg());
+
+ return false;
+}
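A few hypothetical operands and how skipOp() classifies them (instruction syntax invented):

  // skipOp() returns true (operand ignored) for e.g.:
  //   $r2d  in "%1:gr64 = COPY $r2d"     -> physreg (when SkipPhysRegs)
  //   %2    in "dead %2:gr32 = LHI 0"    -> dead def
  //   %3    in "%5 = ADD %4, undef %3"   -> undef use
  // Explicit, live virtual-register defs and uses return false and are kept.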
+
void GenericScheduler::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() &&
"(PreRA)GenericScheduler needs vreg liveness");
@@ -3256,10 +3334,59 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
if (RegionPolicy.ComputeDFSResult)
DAG->computeDFSResult();
+ if (DFS) {
+ DAG->computeDFSResult();
+ const SchedDFSResult *DFSResult = DAG->getDFSResult();
+ TreeDefs = std::vector<std::set<Register> > (DAG->SUnits.size());
+ TreeUses = std::vector<std::set<Register> > (DAG->SUnits.size());
+ TreeSUs = std::vector<std::set<const SUnit *> >(DAG->SUnits.size());
+ assert(NextSubtreeSU == nullptr);
+
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ const SUnit *SU = &DAG->SUnits[Idx];
+ TreeSUs[Idx].insert(SU);
+ if (SU->getInstr()->getNumOperands()) {
+ const MachineOperand &MO = SU->getInstr()->getOperand(0);
+ if (!skipOp(MO, &DAG->MRI) && MO.isDef())
+ TreeDefs[Idx].insert(MO.getReg());
+ }
+
+ // Propagate from above in subtree.
+ unsigned SUTree = DFSResult->getSubtreeID(SU);
+ for (const SDep &Pred : SU->Preds) {
+ const SUnit *PredSU = Pred.getSUnit();
+ if (PredSU->isBoundaryNode())
+ continue;
+ unsigned PI = PredSU->NodeNum;
+ if (DFSResult->getSubtreeID(PredSU) == SUTree) {
+ TreeSUs[Idx].insert(TreeSUs[PI].begin(), TreeSUs[PI].end());
+ TreeDefs[Idx].insert(TreeDefs[PI].begin(), TreeDefs[PI].end());
+ TreeUses[Idx].insert(TreeUses[PI].begin(), TreeUses[PI].end());
+ }
+ }
+
+ // Record virtual register uses not defined in the subtree. Avoid any
+ // involvement with physregs.
+ for (auto &MO : SU->getInstr()->explicit_operands())
+ if (!skipOp(MO, &DAG->MRI, /*SkipPhysRegs=*/false) && MO.isUse() &&
+ !TreeDefs[Idx].count(MO.getReg()))
+ TreeUses[Idx].insert(MO.getReg());
+ }
+ }
+
Rem.init(DAG, SchedModel);
Top.init(DAG, SchedModel, &Rem);
Bot.init(DAG, SchedModel, &Rem);
+ DAGHeight = 0;
+ DAGDepth = 0;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
+ DAGDepth = std::max(DAGDepth, DAG->SUnits[Idx].getDepth());
+ }
+ NumScheduled = 0;
+ initLiveRegs(DAG);
+
// Initialize resource counts.
// Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
@@ -3275,6 +3402,23 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
BotCand.SU = nullptr;
}
+void GenericScheduler::initLiveRegs(ScheduleDAGMILive *DAG) {
+ LiveRegs.clear();
+ // TODO: Is this slow...?
+ for (unsigned I = 0, E = DAG->MRI.getNumVirtRegs(); I != E; ++I) {
+ Register VirtReg = Register::index2VirtReg(I);
+ const LiveInterval &LI = DAG->getLIS()->getInterval(VirtReg);
+ LiveQueryResult LRQ = LI.Query(DAG->getLIS()->
+ getInstructionIndex(*DAG->SUnits.back().getInstr()));
+ if (LRQ.valueOut())
+ LiveRegs.insert(VirtReg);
+ }
+ LLVM_DEBUG( dbgs() << " Live outs: ";
+ for (auto Reg : LiveRegs)
+ dbgs() << "%" << Register::virtReg2Index(Reg) << ", ";
+ dbgs() << "\n";);
+}
+
/// Initialize the per-region scheduling policy.
void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
@@ -3324,6 +3468,11 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
if (RegionPolicy.OnlyTopDown)
RegionPolicy.OnlyBottomUp = false;
}
+
+ if (doOOOSchedForRegion(NumRegionInstrs)) {
+ RegionPolicy.OnlyBottomUp = true;
+ RegionPolicy.OnlyTopDown = false;
+ }
}
void GenericScheduler::dumpPolicy() const {
@@ -3337,13 +3486,6 @@ void GenericScheduler::dumpPolicy() const {
#endif
}
-bool GenericScheduler::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) const {
- const MachineFunction &MF = *Begin->getMF();
- return MF.getSubtarget().disableForRegionPreRA(Begin, End, NumRegionInstrs);
-}
-
/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
/// critical path by more cycles than it takes to drain the instruction buffer.
/// We estimate an upper bounds on in-flight instructions as:
@@ -3486,6 +3628,76 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
return 0;
}
+
+// Compute the current PressureDiff for MI and return it in PDiff.
+void GenericScheduler::getMIPDiff(const MachineInstr *MI, PressureDiff &PDiff) const {
+ std::set<Register> Kills, Defs;
+
+ for (auto &MO : MI->explicit_operands()) {
+ if (skipOp(MO, &DAG->MRI))
+ continue;
+ const LiveInterval &LI = DAG->getLIS()->getInterval(MO.getReg());
+ LiveQueryResult LRQ = LI.Query(DAG->getLIS()->getInstructionIndex(*MI));
+ if (MO.isUse() && !LiveRegs.count(MO.getReg()))
+ Kills.insert(MO.getReg());
+ else if (MO.isDef() && LRQ.valueOut() != nullptr && LRQ.valueIn() == nullptr)
+ Defs.insert(MO.getReg());
+ }
+
+ for (auto &Kill : Kills)
+ PDiff.addPressureChange(Kill, false/*IsDec*/, &DAG->MRI);
+ for (auto &Def : Defs)
+ PDiff.addPressureChange(Def, true/*IsDec*/, &DAG->MRI);
+}
+
+// Compute the current PressureDiff for a subtree beginning at the NodeNum SU.
+void GenericScheduler::getTreePDiff(unsigned NodeNum,
+ PressureDiff &PDiff) const {
+ // Only consider relatively small subtrees.
+ if (TreeSUs[NodeNum].size() <= 1 || TreeSUs[NodeNum].size() > DFSSIZE)
+ return;
+
+ // Don't schedule a subtree if it would cause a register to become live.
+ for (auto &Reg : TreeUses[NodeNum])
+ if (!LiveRegs.count(Reg))
+ return;
+
+ // Check that this is a subtree that can be scheduled as a unit.
+ for (auto *TreeSU : TreeSUs[NodeNum])
+ for (const SDep &Succ : TreeSU->Succs)
+ if (!Succ.getSUnit()->isScheduled &&
+ !TreeSUs[NodeNum].count(Succ.getSUnit()))
+ return;
+
+ // Fill in the PressureDiff with the currently live registers that this
+ // subtree defines (and does not merely redefine).
+ for (auto R : TreeDefs[NodeNum])
+ if (LiveRegs.count(R) && !TreeUses[NodeNum].count(R))
+ PDiff.addPressureChange(R, true/*IsDec*/, &DAG->MRI);
+}
+
+// Compare two pressure diffs and return a non-zero value only in cases where
+// one is increasing while the other is decreasing the same pressure set. The
+// returned value reflects PDiff2, i.e. it is negative if PDiff2 decreases
+// pressure.
+int GenericScheduler::comparePDiffs(PressureDiff &PDiff1,
+ PressureDiff &PDiff2) const {
+ int RPScore = 0;
+ for (const PressureChange &PC1 : PDiff1) {
+ if (!PC1.isValid())
+ break;
+ for (const PressureChange &PC2 : PDiff2) {
+ if (!PC2.isValid())
+ break;
+ if (PC1.getPSet() == PC2.getPSet() &&
+ (PC2.getUnitInc() < 0) != (PC1.getUnitInc() < 0)) {
+ RPScore += PC2.getUnitInc() < 0 ? -1 : 1;
+ break;
+ }
+ }
+ }
+ return RPScore;
+}
} // end namespace llvm
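As a sanity check of the sign convention, a standalone sketch that mirrors the comparePDiffs() loop above using plain pairs instead of the PressureDiff API (pressure-set ids and unit increments invented):

  #include <cassert>
  #include <utility>
  #include <vector>

  // (PSet, UnitInc) pairs stand in for PressureChange entries.
  using PDiff = std::vector<std::pair<unsigned, int>>;

  static int comparePDiffs(const PDiff &PDiff1, const PDiff &PDiff2) {
    int RPScore = 0;
    for (auto &PC1 : PDiff1)
      for (auto &PC2 : PDiff2)
        if (PC1.first == PC2.first && (PC2.second < 0) != (PC1.second < 0)) {
          // Negative when PDiff2 decreases a set that PDiff1 increases.
          RPScore += PC2.second < 0 ? -1 : 1;
          break;
        }
    return RPScore;
  }

  int main() {
    assert(comparePDiffs({{0, 1}}, {{0, -2}}) == -1); // PDiff2 decreases set 0
    assert(comparePDiffs({{0, -1}}, {{0, 2}}) == 1);  // PDiff2 increases set 0
    assert(comparePDiffs({{0, 1}}, {{0, 2}}) == 0);   // same direction: no score
    return 0;
  }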
void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -3525,6 +3737,18 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
<< Cand.RPDelta.Excess.getUnitInc() << "\n");
}
+#ifndef NDEBUG
+// Dump PressureDiffs for SU and Subtree.
+void dumpPDiffs_SU_STree(unsigned SUNodeNum, unsigned SubtreeNodeNum,
+ PressureDiff &SUPDiff, PressureDiff &SubtreePDiff,
+ const TargetRegisterInfo *TRI) {
+ dbgs() << "SU(" << SUNodeNum << ") PDiff: \t";
+ SUPDiff.dump(*TRI);
+ dbgs() << "Subtree starting with SU(" << SubtreeNodeNum << ") PDiff: \t";
+ SubtreePDiff.dump(*TRI);
+}
+#endif
+
/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
@@ -3545,11 +3769,120 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
return true;
}
+ // Experimental: produce the same output order as in the input.
+ if (INPUTORDER) {
+ if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ return false;
+ }
+
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
+ // Experimental scheduling for OOO target.
+ if (doOOOSchedForRegion(DAG->SUnits.size())) {
+
+ bool SkipPhysRegs = biasPhysReg(TryCand.SU, TryCand.AtTop) &&
+ biasPhysReg(Cand.SU, Cand.AtTop);
+
+ if (REGPRESS && !SkipPhysRegs) {
+ // Schedule from NextQueue until it's empty.
+ if (DFS && !NextQueue.empty() &&
+ tryGreater(NextQueue.count(TryCand.SU), NextQueue.count(Cand.SU),
+ TryCand, Cand, GenericSchedulerBase::RegPressure))
+ return TryCand.Reason != NoCand;
+
+ // Compare pressure diffs between the candidates and schedule an
+ // instruction that decreases register pressure (such as an immediate
+ // load) below the other.
+ PressureDiff CandMIPDiff;
+ PressureDiff TryCandMIPDiff;
+ getMIPDiff(Cand.SU->getInstr(), CandMIPDiff);
+ getMIPDiff(TryCand.SU->getInstr(), TryCandMIPDiff);
+ if (int TryCandRPScore = comparePDiffs(CandMIPDiff, TryCandMIPDiff)) {
+ LLVM_DEBUG(dbgs() << "SU(" << Cand.SU->NodeNum << ") PDiff: \t";
+ CandMIPDiff.dump(*TRI);
+ dbgs() << "SU(" << TryCand.SU->NodeNum << ") PDiff: \t";
+ TryCandMIPDiff.dump(*TRI););
+ tryLess(TryCandRPScore, 0, TryCand, Cand, GenericSchedulerBase::RegPressure);
+ return TryCand.Reason != NoCand;
+ }
+
+ // See if there is a subtree that would reduce register pressure if
+ // scheduled.
+ if (DFS && NextQueue.empty()) {
+ PressureDiff TryCandTreePDiff;
+ getTreePDiff(TryCand.SU->NodeNum, TryCandTreePDiff);
+ if (comparePDiffs(CandMIPDiff, TryCandTreePDiff) < 0) {
+ LLVM_DEBUG(dumpPDiffs_SU_STree(Cand.SU->NodeNum, TryCand.SU->NodeNum,
+ CandMIPDiff, TryCandTreePDiff, TRI););
+ TryCand.Reason = GenericSchedulerBase::RegPressure;
+ NextSubtreeSU = TryCand.SU;
+ return true;
+ }
+ PressureDiff CandTreePDiff;
+ getTreePDiff(Cand.SU->NodeNum, CandTreePDiff);
+ if (comparePDiffs(TryCandMIPDiff, CandTreePDiff) < 0) {
+ LLVM_DEBUG(dumpPDiffs_SU_STree(TryCand.SU->NodeNum, Cand.SU->NodeNum,
+ TryCandMIPDiff, CandTreePDiff, TRI););
+ Cand.Reason = GenericSchedulerBase::RegPressure;
+ NextSubtreeSU = Cand.SU;
+ return false;
+ }
+ }
+
+ // An SU that only increases register pressure (bottom-up) would help
+ // register pressure if scheduled higher (e.g. a store). Don't push all of
+ // its predecessors further up, but at least make sure that the SU is
+ // scheduled immediately after its predecessors in the input order.
+ auto onlyIncreases = [&](PressureDiff &PDiff) -> bool {
+ bool Incr = false;
+ bool Decr = false;
+ for (const PressureChange &PC : PDiff) {
+ if (!PC.isValid())
+ break;
+ (PC.getUnitInc() > 0 ? Incr : Decr) = true;
+ }
+ return Incr && !Decr;
+ };
+ auto maxPredNum = [&](const SUnit *SU) -> unsigned {
+ unsigned MaxPredNodeNum = 0;
+ for (const SDep &Pred : SU->Preds)
+ if (Pred.getSUnit() != &DAG->EntrySU &&
+ Pred.getSUnit()->NodeNum > MaxPredNodeNum)
+ MaxPredNodeNum = Pred.getSUnit()->NodeNum;
+ return MaxPredNodeNum;
+ };
+ bool TryCandIncr = onlyIncreases(TryCandMIPDiff);
+ bool CandIncr = onlyIncreases(CandMIPDiff);
+ if (TryCandIncr != CandIncr) {
+ bool TryCandHeur = (TryCandIncr &&
+ maxPredNum(TryCand.SU) < Cand.SU->NodeNum);
+ bool CandHeur = (CandIncr &&
+ maxPredNum(Cand.SU) < TryCand.SU->NodeNum);
+ if (tryLess(TryCandHeur, CandHeur, TryCand, Cand,
+ GenericSchedulerBase::RegPressure))
+ return TryCand.Reason != NoCand;
+ }
+ }
+
+ if (!SkipPhysRegs && doHeightHeurForRegion(DAG, DAGHeight) &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ return false;
+ }
+
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
Cand.RPDelta.Excess,
@@ -3654,6 +3987,11 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+ LLVM_DEBUG( dbgs() << "Live regs: ";
+ for (auto R : LiveRegs)
+ dbgs() << "%" << Register::virtReg2Index(R) << ", ";
+ dbgs() << "\n";);
+
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
@@ -3853,6 +4191,59 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (SU->hasPhysRegDefs)
reschedulePhysReg(SU, false);
}
+
+ MachineInstr *MI = SU->getInstr();
+ for (auto &MO : MI->explicit_operands())
+ if (!skipOp(MO, &DAG->MRI)) {
+ if (MO.isDef()) {
+ assert(LiveRegs.count(MO.getReg()) || MO.isDead());
+ if (!MO.getSubReg()) {
+ LiveRegs.erase(MO.getReg());
+ } else {
+ const LiveInterval &LI = DAG->getLIS()->getInterval(MO.getReg());
+ LiveQueryResult LRQ =
+ LI.Query(DAG->getLIS()->getInstructionIndex(*MI));
+ if (!LRQ.valueIn())
+ LiveRegs.erase(MO.getReg());
+ }
+ }
+ else if (MO.readsReg())
+ LiveRegs.insert(MO.getReg());
+ }
+ ++NumScheduled;
+
+ if (NextSubtreeSU) {
+ assert(NextQueue.empty());
+ if (NextSubtreeSU == SU) {
+ for (auto *TSU : TreeSUs[SU->NodeNum])
+ if (!TSU->isScheduled)
+ NextQueue.insert(TSU);
+ LLVM_DEBUG(dbgs() << "Scheduling subtree: ";
+ for (auto *NxSU : NextQueue)
+ dbgs() << NxSU->NodeNum << " ";
+ dbgs() << "\n";);
+ }
+ NextSubtreeSU = nullptr;
+ }
+
+ if (!NextQueue.empty()) {
+ assert(NextQueue.count(SU) && "Failed to schedule planned SU.");
+ NextQueue.erase(SU);
+#ifndef NDEBUG
+ const SchedDFSResult *DFSResult = DAG->getDFSResult();
+ unsigned SUTree = DFSResult->getSubtreeID(SU);
+ for (const SDep &Pred : SU->Preds) {
+ const SUnit *PredSU = Pred.getSUnit();
+ assert((PredSU->isBoundaryNode() ||
+ Pred.getKind() != SDep::Data ||
+ (DFSResult->getSubtreeID(PredSU) == SUTree &&
+ NextQueue.count(PredSU)) ||
+ LiveRegs.count(Pred.getReg()) ||
+ Register::isPhysicalRegister(Pred.getReg())) &&
+ "Expected no data edges exiting the subtree.");
+ }
+#endif
+ }
}
/// Create the standard converging machine scheduler. This will be used as the
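Putting tryCandidate() and schedNode() together, the intended subtree flow is roughly the following (a hypothetical bottom-up trace, node numbers invented):

  pick SU(7)        -> tryCandidate() finds its subtree profitable and
                       sets NextSubtreeSU = SU(7)
  schedNode(SU(7))  -> fills NextQueue from TreeSUs[7], then erases SU(7)
                       itself, leaving { SU(3), SU(5) }
  pick SU(5)        -> preferred by the NextQueue.count() check
  schedNode(SU(5))  -> NextQueue = { SU(3) }
  pick SU(3)        -> NextQueue drains; the normal heuristics resume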
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index e09ca747e4fe2..d0badd3692e40 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -72,22 +72,6 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
TLInfo(TM, *this), FrameLowering(SystemZFrameLowering::create(*this)) {}
-
-// EXPERIMENTAL
-cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
-bool SystemZSubtarget::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) const {
- // It seems that the generic scheduler currently can increase spilling heavily
- // with big / huge regions. Disable it until it is fixed.
- if (NumRegionInstrs > NoSchedAbove) {
- LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
- << NumRegionInstrs << " instructions\n";);
- return true;
- }
- return false;
-}
-
bool SystemZSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index c5749405cc71a..5fa7c8f194ebf 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -89,12 +89,6 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
// "source" order scheduler.
bool enableMachineScheduler() const override { return true; }
- // Don't use pre-ra mischeduler for huge regions where it creates a lot of
- // spilling (temporary solution).
- bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) const override;
-
// This is important for reducing register pressure in vector code.
bool useAA() const override { return true; }