[llvm] [MachineScheduler] Experimental option to partially disable pre-ra scheduling. (PR #90181)

Jonas Paulsson via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 13 23:27:44 PDT 2024


https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/90181

From 5224a6b035033e3f47e0b4ce85eaa13ac7968c37 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 24 Apr 2024 17:21:42 +0200
Subject: [PATCH 1/2] -nosched-above

---
 llvm/include/llvm/CodeGen/MachineScheduler.h    | 17 +++++++++++++++++
 llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h   |  5 +++++
 llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 10 ++++++++++
 llvm/lib/CodeGen/MachineScheduler.cpp           | 10 +++++++++-
 llvm/lib/Target/SystemZ/SystemZSubtarget.cpp    | 16 ++++++++++++++++
 llvm/lib/Target/SystemZ/SystemZSubtarget.h      |  6 ++++++
 6 files changed, 63 insertions(+), 1 deletion(-)
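
In rough terms, this first patch threads a per-region veto through the
scheduling layers. A condensed sketch (not verbatim) of the resulting call
chain:

  // MachineSchedulerBase::scheduleRegions() treats a region like an empty
  // one (leaves it unscheduled) when the new hook returns true:
  //   ScheduleDAGMILive::disableForRegion()
  //     -> MachineSchedStrategy::disableForRegionPreRA()   (strategy hook)
  //   GenericScheduler::disableForRegionPreRA()
  //     -> TargetSubtargetInfo::disableForRegionPreRA()    (subtarget hook)
  //   SystemZSubtarget::disableForRegionPreRA()
  //     -> NumRegionInstrs > NoSchedAbove ("-nosched-above", default ~0U)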

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index b15abf040058e..fb22f7de0a562 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -241,6 +241,12 @@ class MachineSchedStrategy {
   /// Tell the strategy that MBB is about to be processed.
   virtual void enterMBB(MachineBasicBlock *MBB) {};
 
+  virtual bool disableForRegionPreRA(MachineBasicBlock::iterator begin,
+                                     MachineBasicBlock::iterator end,
+                                     unsigned regioninstrs) const {
+    return false;
+  }
+
   /// Tell the strategy that current MBB is done.
   virtual void leaveMBB() {};
 
@@ -487,6 +493,13 @@ class ScheduleDAGMILive : public ScheduleDAGMI {
                    MachineBasicBlock::iterator end,
                    unsigned regioninstrs) override;
 
+  bool disableForRegion(MachineBasicBlock *bb,
+                        MachineBasicBlock::iterator begin,
+                        MachineBasicBlock::iterator end,
+                        unsigned regioninstrs) const override {
+    return SchedImpl->disableForRegionPreRA(begin, end, regioninstrs);
+  }
+
   /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
   /// reorderable instructions.
   void schedule() override;
@@ -1219,6 +1232,10 @@ class GenericScheduler : public GenericSchedulerBase {
 
   void dumpPolicy() const override;
 
+  bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+                             MachineBasicBlock::iterator End,
+                             unsigned NumRegionInstrs) const override;
+
   bool shouldTrackPressure() const override {
     return RegionPolicy.ShouldTrackPressure;
   }
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 32ff15fc75936..6594048b8f8a2 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -319,6 +319,11 @@ namespace llvm {
                              MachineBasicBlock::iterator end,
                              unsigned regioninstrs);
 
+    virtual bool disableForRegion(MachineBasicBlock *bb,
+                                  MachineBasicBlock::iterator begin,
+                                  MachineBasicBlock::iterator end,
+                                  unsigned regioninstrs) const { return false; }
+
     /// Called when the scheduler has finished scheduling the current region.
     virtual void exitRegion();
 
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 5023e29ce145a..9d74a2ec57804 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MacroFusion.h"
 #include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
@@ -229,6 +230,15 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
                                    unsigned NumRegionInstrs) const {}
 
+  /// Allow the subtarget to leave a region untouched. This is purposely kept
+  /// separate from other methods, as it is hopefully just a temporary
+  /// solution.
+  virtual bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+                                     MachineBasicBlock::iterator End,
+                                     unsigned NumRegionInstrs) const {
+    return false;
+  }
+
   // Perform target-specific adjustments to the latency of a schedule
   // dependency.
   // If a pair of operands is associated with the schedule dependency, DefOpIdx
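
With this hook in place, any subtarget can opt individual regions out of
pre-RA scheduling. A minimal sketch for a hypothetical out-of-tree target
(the FooSubtarget name and the threshold are illustrative, not part of this
patch):

  bool FooSubtarget::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
                                           MachineBasicBlock::iterator End,
                                           unsigned NumRegionInstrs) const {
    // Leave very large regions in input order; the limit is made up here.
    return NumRegionInstrs > 200;
  }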
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index cf72f74380835..38a8a1de70c2e 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -641,7 +641,8 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
 
       // Skip empty scheduling regions (0 or 1 schedulable instructions).
-      if (I == RegionEnd || I == std::prev(RegionEnd)) {
+      if (I == RegionEnd || I == std::prev(RegionEnd) ||
+          Scheduler.disableForRegion(&*MBB, I, RegionEnd, NumRegionInstrs)) {
         // Close the current region. Bundle the terminator if needed.
         // This invalidates 'RegionEnd' and 'I'.
         Scheduler.exitRegion();
@@ -3336,6 +3337,13 @@ void GenericScheduler::dumpPolicy() const {
 #endif
 }
 
+bool GenericScheduler::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+                                             MachineBasicBlock::iterator End,
+                                             unsigned NumRegionInstrs) const {
+  const MachineFunction &MF = *Begin->getMF();
+  return MF.getSubtarget().disableForRegionPreRA(Begin, End, NumRegionInstrs);
+}
+
 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
 /// critical path by more cycles than it takes to drain the instruction buffer.
 /// We estimate an upper bounds on in-flight instructions as:
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index d0badd3692e40..e09ca747e4fe2 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -72,6 +72,22 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
       InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
       TLInfo(TM, *this), FrameLowering(SystemZFrameLowering::create(*this)) {}
 
+
+// EXPERIMENTAL
+cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
+bool SystemZSubtarget::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+                                             MachineBasicBlock::iterator End,
+                                             unsigned NumRegionInstrs) const {
+  // It seems that the generic scheduler currently can increase spilling heavily
+  // with big / huge regions. Disable it until it is fixed.
+  if (NumRegionInstrs > NoSchedAbove) {
+    LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
+                      << NumRegionInstrs << " instructions\n";);
+    return true;
+  }
+  return false;
+}
+
 bool SystemZSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
 }
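
For illustration, with this patch applied, pre-RA scheduling of big regions
could be skipped on SystemZ like so (350 is only an example value; the
option defaults to ~0U, i.e. never triggering):

  $ llc -mtriple=s390x-linux-gnu -nosched-above=350 foo.ll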
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 5fa7c8f194ebf..c5749405cc71a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -89,6 +89,12 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
   // "source" order scheduler.
   bool enableMachineScheduler() const override { return true; }
 
+  // Don't use pre-ra mischeduler for huge regions where it creates a lot of
+  // spilling (temporary solution).
+  bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
+                             MachineBasicBlock::iterator End,
+                             unsigned NumRegionInstrs) const override;
+
   // This is important for reducing register pressure in vector code.
   bool useAA() const override { return true; }
 

From bae21d51a33c66672e3a67dc919a6b23fa266b09 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 26 Apr 2024 08:53:31 +0200
Subject: [PATCH 2/2] Some heuristics for OOO scheduling

---
 llvm/include/llvm/CodeGen/MachineScheduler.h  |  44 +-
 llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h |   4 +-
 .../llvm/CodeGen/TargetSubtargetInfo.h        |  10 -
 llvm/lib/CodeGen/MachineScheduler.cpp         | 411 +++++++++++++++++-
 llvm/lib/Target/SystemZ/SystemZSubtarget.cpp  |  16 -
 llvm/lib/Target/SystemZ/SystemZSubtarget.h    |   6 -
 6 files changed, 434 insertions(+), 57 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index fb22f7de0a562..6a0c886cbd78e 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -94,6 +94,7 @@
 #include <cassert>
 #include <llvm/Support/raw_ostream.h>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -241,12 +242,6 @@ class MachineSchedStrategy {
   /// Tell the strategy that MBB is about to be processed.
   virtual void enterMBB(MachineBasicBlock *MBB) {};
 
-  virtual bool disableForRegionPreRA(MachineBasicBlock::iterator begin,
-                                     MachineBasicBlock::iterator end,
-                                     unsigned regioninstrs) const {
-    return false;
-  }
-
   /// Tell the strategy that current MBB is done.
   virtual void leaveMBB() {};
 
@@ -496,9 +491,7 @@ class ScheduleDAGMILive : public ScheduleDAGMI {
   bool disableForRegion(MachineBasicBlock *bb,
                         MachineBasicBlock::iterator begin,
                         MachineBasicBlock::iterator end,
-                        unsigned regioninstrs) const override {
-    return SchedImpl->disableForRegionPreRA(begin, end, regioninstrs);
-  }
+                        unsigned regioninstrs) const override;
 
   /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
   /// reorderable instructions.
@@ -1084,7 +1077,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
   enum CandReason : uint8_t {
     NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak,
     RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
-    TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
+    TopDepthReduce, TopPathReduce, NextDefUse, RegPressure, NodeOrder};
 
 #ifndef NDEBUG
   static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
@@ -1221,6 +1214,33 @@ int biasPhysReg(const SUnit *SU, bool isTop);
 /// GenericScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
 class GenericScheduler : public GenericSchedulerBase {
+  //// Experimental members for OOO scheduling. ////
+
+  // TODO: Integrate with SchedDFSResult class.
+  // SU -> Nodes above in subtree.
+  std::vector<std::set<const SUnit *> > TreeSUs;
+  // SU -> Virtual regs defined above in subtree.
+  std::vector<std::set<Register> > TreeDefs;
+  // SU -> Regs used but not defined above in subtree.
+  std::vector<std::set<Register> > TreeUses;
+
+  // If this SU is non-null, it is the start of a subtree to be scheduled as
+  // a unit.
+  mutable SUnit *NextSubtreeSU = nullptr;
+  // A (small) set of instructions to be scheduled next as a unit.
+  std::set<const SUnit *> NextQueue;
+
+  unsigned DAGHeight;
+  unsigned DAGDepth;
+  unsigned NumScheduled;
+  std::set<Register> LiveRegs; // Currently live registers.
+
+  void initLiveRegs(ScheduleDAGMILive *DAG);
+  void getMIPDiff(const MachineInstr *MI, PressureDiff &PDiff) const;
+  void getTreePDiff(unsigned NodeNum, PressureDiff &PDiff) const;
+  int comparePDiffs(PressureDiff &PDiff1, PressureDiff &PDiff2) const;
+  ////                                          ////
+
 public:
   GenericScheduler(const MachineSchedContext *C):
     GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
@@ -1232,10 +1252,6 @@ class GenericScheduler : public GenericSchedulerBase {
 
   void dumpPolicy() const override;
 
-  bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
-                             MachineBasicBlock::iterator End,
-                             unsigned NumRegionInstrs) const override;
-
   bool shouldTrackPressure() const override {
     return RegionPolicy.ShouldTrackPressure;
   }
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 6594048b8f8a2..789948acedcab 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -322,7 +322,9 @@ namespace llvm {
     virtual bool disableForRegion(MachineBasicBlock *bb,
                                   MachineBasicBlock::iterator begin,
                                   MachineBasicBlock::iterator end,
-                                  unsigned regioninstrs) const { return false; }
+                                  unsigned regioninstrs) const {
+      return false;
+    }
 
     /// Called when the scheduler has finished scheduling the current region.
     virtual void exitRegion();
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 9d74a2ec57804..5023e29ce145a 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -16,7 +16,6 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MacroFusion.h"
 #include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
@@ -230,15 +229,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
                                    unsigned NumRegionInstrs) const {}
 
-  /// Allow the subtarget to leave a region untouched. This is purposely kept
-  /// separate from other methods, as it is hopefully just a temporary
-  /// solution.
-  virtual bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
-                                     MachineBasicBlock::iterator End,
-                                     unsigned NumRegionInstrs) const {
-    return false;
-  }
-
   // Perform target-specific adjustments to the latency of a schedule
   // dependency.
   // If a pair of operands is associated with the schedule dependency, DefOpIdx
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 38a8a1de70c2e..f071dc40c3724 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1253,6 +1253,23 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
          "ShouldTrackLaneMasks requires ShouldTrackPressure");
 }
 
+// EXPERIMENTAL: It seems that GenericScheduler currently often increases
+// spilling heavily with huge regions (like >350 instructions). This option
+// skips pre-RA scheduling for any region with more instructions than its
+// value.
+cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
+bool ScheduleDAGMILive::disableForRegion(MachineBasicBlock *bb,
+                                         MachineBasicBlock::iterator begin,
+                                         MachineBasicBlock::iterator end,
+                                         unsigned regioninstrs) const {
+  if (regioninstrs > NoSchedAbove) {
+    LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
+                      << regioninstrs << " instructions\n";);
+    return true;
+  }
+  return false;
+}
+
 // Setup the register pressure trackers for the top scheduled and bottom
 // scheduled regions.
 void ScheduleDAGMILive::initRegPressure() {
@@ -2455,7 +2472,53 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
 /// can dispatch per cycle.
 ///
 /// TODO: Also check whether the SU must start a new group.
+
+// EXPERIMENTAL: General switch for experimental OOO heuristics.
+// Note: This will (currently) enforce bottom-up only scheduling for affected
+// regions.
+cl::opt<bool> OOOSCHED("misched-ooo", cl::init(false));
+
+// EXPERIMENTAL: Similar to NoSchedAbove. Disables the OOO heuristics for
+// regions with fewer instructions than this limit.
+cl::opt<unsigned> NoOOOSchedBelow("no-ooosched-below", cl::init(0));
+
+// EXPERIMENTAL: Disable cycle hazards for OOO target.
+cl::opt<bool> NOHAZARDS("misched-nohazards", cl::init(true));
+
+// EXPERIMENTAL: Used for verification purposes (disabling rescheduling).
+cl::opt<bool> INPUTORDER("misched-inputorder", cl::init(false));
+
+// EXPERIMENTAL: Enable regpressure heuristics for OOO scheduling.
+cl::opt<bool> REGPRESS("misched-regpress", cl::init(true));
+
+// EXPERIMENTAL: Use DFSResult for regpressure heuristics with OOO scheduling.
+cl::opt<bool> DFS("misched-dfs", cl::init(true));
+
+// EXPERIMENTAL: The size limit of subtrees to schedule as a unit.
+cl::opt<unsigned> DFSSIZE("dfs-size", cl::init(4));
+
+// EXPERIMENTAL: Enable height heuristic for OOO scheduling.
+cl::opt<bool> HEIGHTHEUR("misched-heightheur", cl::init(true));
+
+// EXPERIMENTAL: Rough differentiator for the height heuristic: if the DAG
+// size divided by its height is greater than or equal to this value, the
+// DAG is too "wide" to enable the height heuristic.
+cl::opt<unsigned> HEIGHTIFWFAC("misched-heightifwfac", cl::init(3));
+
+static bool doOOOSchedForRegion(unsigned NumRegionInstrs) {
+  return OOOSCHED && NumRegionInstrs > NoOOOSchedBelow;
+}
+
+static bool doHeightHeurForRegion(const ScheduleDAGMI *DAG, unsigned DAGHeight) {
+  return HEIGHTHEUR &&
+         DAGHeight != 0 && (DAG->SUnits.size() / DAGHeight) < HEIGHTIFWFAC;
+}
+
 bool SchedBoundary::checkHazard(SUnit *SU) {
+  // Better to make SU available and potentially reduce register pressure.
+  if ((doOOOSchedForRegion(DAG->SUnits.size()) && NOHAZARDS) || INPUTORDER)
+    return false;
+
   if (HazardRec->isEnabled()
       && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
     return true;
@@ -2692,9 +2755,9 @@ void SchedBoundary::bumpNode(SUnit *SU) {
   // exceed the issue width.
   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
   unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
-  assert(
-      (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
-      "Cannot schedule this instruction's MicroOps in the current cycle.");
+  assert((doOOOSchedForRegion(DAG->SUnits.size()) || INPUTORDER ||
+       (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth())) &&
+       "Cannot schedule this instruction's MicroOps in the current cycle.");
 
   unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
   LLVM_DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
@@ -3109,6 +3172,7 @@ const char *GenericSchedulerBase::getReasonStr(
   case BotHeightReduce:return "BOT-HEIGHT";
   case BotPathReduce:  return "BOT-PATH  ";
   case NextDefUse:     return "DEF-USE   ";
+  case RegPressure:    return "REG-PRESS ";
   case NodeOrder:      return "ORDER     ";
   };
   llvm_unreachable("Unknown reason!");
@@ -3246,6 +3310,20 @@ static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
   tracePick(Cand.Reason, Cand.AtTop);
 }
 
+// Return true for operands that are not interesting registers from a
+// scheduling register pressure perspective, so the caller can skip them.
+static bool skipOp(const MachineOperand &Op, const MachineRegisterInfo *MRI,
+                   bool SkipPhysRegs = true) {
+  if (!Op.isReg() || !Op.getReg() || Op.isImplicit() ||
+      (Op.isUse() && Op.isUndef()) || Op.isDead())
+    return true;
+
+  if (Register::isPhysicalRegister(Op.getReg()))
+    return SkipPhysRegs ? true : !MRI->isAllocatable(Op.getReg());
+
+  return false;
+}
+
 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
   assert(dag->hasVRegLiveness() &&
          "(PreRA)GenericScheduler needs vreg liveness");
@@ -3256,10 +3334,59 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
   if (RegionPolicy.ComputeDFSResult)
     DAG->computeDFSResult();
 
+  if (DFS) {
+    DAG->computeDFSResult();
+    const SchedDFSResult *DFSResult = DAG->getDFSResult();
+    TreeDefs  = std::vector<std::set<Register> >     (DAG->SUnits.size());
+    TreeUses  = std::vector<std::set<Register> >     (DAG->SUnits.size());
+    TreeSUs   = std::vector<std::set<const SUnit *> >(DAG->SUnits.size());
+    assert(NextSubtreeSU == nullptr);
+
+    for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+      const SUnit *SU = &DAG->SUnits[Idx];
+      TreeSUs[Idx].insert(SU);
+      if (SU->getInstr()->getNumOperands()) {
+        const MachineOperand &MO = SU->getInstr()->getOperand(0);
+        if (!skipOp(MO, &DAG->MRI) && MO.isDef())
+          TreeDefs[Idx].insert(MO.getReg());
+      }
+
+      // Propagate from above in subtree.
+      unsigned SUTree = DFSResult->getSubtreeID(SU);
+      for (const SDep &Pred : SU->Preds) {
+        const SUnit *PredSU = Pred.getSUnit();
+        if (PredSU->isBoundaryNode())
+          continue;
+        unsigned PI = PredSU->NodeNum;
+        if (DFSResult->getSubtreeID(PredSU) == SUTree) {
+          TreeSUs[Idx].insert(TreeSUs[PI].begin(), TreeSUs[PI].end());
+          TreeDefs[Idx].insert(TreeDefs[PI].begin(), TreeDefs[PI].end());
+          TreeUses[Idx].insert(TreeUses[PI].begin(), TreeUses[PI].end());
+        }
+      }
+
+      // Record virtual register uses not defined within the subtree. Avoid
+      // involvement with physregs.
+      for (auto &MO : SU->getInstr()->explicit_operands())
+        if (!skipOp(MO, &DAG->MRI, /*SkipPhysRegs=*/false) && MO.isUse() &&
+            !TreeDefs[Idx].count(MO.getReg()))
+          TreeUses[Idx].insert(MO.getReg());
+    }
+  }
+
   Rem.init(DAG, SchedModel);
   Top.init(DAG, SchedModel, &Rem);
   Bot.init(DAG, SchedModel, &Rem);
 
+  DAGHeight = 0;
+  DAGDepth = 0;
+  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+    DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
+    DAGDepth = std::max(DAGDepth, DAG->SUnits[Idx].getDepth());
+  }
+  NumScheduled = 0;
+  initLiveRegs(DAG);
+
   // Initialize resource counts.
 
   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
@@ -3275,6 +3402,23 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
   BotCand.SU = nullptr;
 }
 
+void GenericScheduler::initLiveRegs(ScheduleDAGMILive *DAG) {
+  LiveRegs.clear();
+  // TODO: Is this slow...?
+  for (unsigned I = 0, E = DAG->MRI.getNumVirtRegs(); I != E; ++I) {
+    Register VirtReg = Register::index2VirtReg(I);
+    const LiveInterval &LI = DAG->getLIS()->getInterval(VirtReg);
+    LiveQueryResult LRQ = LI.Query(DAG->getLIS()->
+                            getInstructionIndex(*DAG->SUnits.back().getInstr()));
+    if (LRQ.valueOut())
+      LiveRegs.insert(VirtReg);
+  }
+  LLVM_DEBUG( dbgs() << " Live outs: ";
+              for (auto Reg : LiveRegs)
+                dbgs() << "%" << Register::virtReg2Index(Reg) << ", ";
+              dbgs() << "\n";);
+}
+
 /// Initialize the per-region scheduling policy.
 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
                                   MachineBasicBlock::iterator End,
@@ -3324,6 +3468,11 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
     if (RegionPolicy.OnlyTopDown)
       RegionPolicy.OnlyBottomUp = false;
   }
+
+  if (doOOOSchedForRegion(NumRegionInstrs)) {
+    RegionPolicy.OnlyBottomUp = true;
+    RegionPolicy.OnlyTopDown = false;
+  }
 }
 
 void GenericScheduler::dumpPolicy() const {
@@ -3337,13 +3486,6 @@ void GenericScheduler::dumpPolicy() const {
 #endif
 }
 
-bool GenericScheduler::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
-                                             MachineBasicBlock::iterator End,
-                                             unsigned NumRegionInstrs) const {
-  const MachineFunction &MF = *Begin->getMF();
-  return MF.getSubtarget().disableForRegionPreRA(Begin, End, NumRegionInstrs);
-}
-
 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
 /// critical path by more cycles than it takes to drain the instruction buffer.
 /// We estimate an upper bounds on in-flight instructions as:
@@ -3486,6 +3628,76 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
 
   return 0;
 }
+
+// Compute the current PressureDiff for MI and return it in PDiff.
+void GenericScheduler::getMIPDiff(const MachineInstr *MI, PressureDiff &PDiff) const {
+  std::set<Register> Kills, Defs;
+
+  for (auto &MO : MI->explicit_operands()) {
+    if (skipOp(MO, &DAG->MRI))
+      continue;
+    const LiveInterval &LI = DAG->getLIS()->getInterval(MO.getReg());
+    LiveQueryResult LRQ = LI.Query(DAG->getLIS()->getInstructionIndex(*MI));
+    if (MO.isUse() && !LiveRegs.count(MO.getReg()))
+      Kills.insert(MO.getReg());
+    else if (MO.isDef() && LRQ.valueOut() != nullptr && LRQ.valueIn() == nullptr)
+      Defs.insert(MO.getReg());
+  }
+
+  for (auto &Kill : Kills)
+    PDiff.addPressureChange(Kill, /*IsDec=*/false, &DAG->MRI);
+  for (auto &Def : Defs)
+    PDiff.addPressureChange(Def, /*IsDec=*/true, &DAG->MRI);
+}
+
+// Compute the current PressureDiff for a subtree beginning at the NodeNum SU.
+void GenericScheduler::getTreePDiff(unsigned NodeNum,
+                                    PressureDiff &PDiff) const {
+  // Only consider relatively small subtrees.
+  if (TreeSUs[NodeNum].size() <= 1 || TreeSUs[NodeNum].size() > DFSSIZE)
+    return;
+
+  // Don't schedule a subtree if it would cause a register to become live.
+  for (auto &Reg : TreeUses[NodeNum])
+    if (!LiveRegs.count(Reg))
+      return;
+
+  // Check that this is a subtree that can be scheduled as a unit.
+  for (auto *TreeSU : TreeSUs[NodeNum])
+    for (const SDep &Succ : TreeSU->Succs)
+      if (!Succ.getSUnit()->isScheduled &&
+          !TreeSUs[NodeNum].count(Succ.getSUnit()))
+        return;
+
+  // Return (via PDiff) the currently live registers that this subtree
+  // defines but does not also use (i.e. is not merely redefining them).
+  for (auto R : TreeDefs[NodeNum])
+    if (LiveRegs.count(R) && !TreeUses[NodeNum].count(R))
+      PDiff.addPressureChange(R, /*IsDec=*/true, &DAG->MRI);
+}
+
+// Compare two pressure diffs and return a non-zero value only in cases where
+// one is increasing while the other is decreasing the same pressure set. The
+// returned value reflects PDiff2: it is negative if PDiff2 decreases
+// pressure.
+int GenericScheduler::comparePDiffs(PressureDiff &PDiff1,
+                                    PressureDiff &PDiff2) const {
+  int RPScore = 0;
+  for (const PressureChange &PC1 : PDiff1) {
+    if (!PC1.isValid())
+      break;
+    for (const PressureChange &PC2 : PDiff2) {
+      if (!PC2.isValid())
+        break;
+      if (PC1.getPSet() == PC2.getPSet() &&
+          (PC2.getUnitInc() < 0) != (PC1.getUnitInc() < 0)) {
+        RPScore += PC2.getUnitInc() < 0 ? -1 : 1;
+        break;
+      }
+    }
+  }
+  return RPScore;
+}
 } // end namespace llvm
 
 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -3525,6 +3737,18 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
              << Cand.RPDelta.Excess.getUnitInc() << "\n");
 }
 
+#ifndef NDEBUG
+// Dump PressureDiffs for SU and Subtree.
+void dumpPDiffs_SU_STree(unsigned SUNodeNum, unsigned SubtreeNodeNum,
+                         PressureDiff &SUPDiff, PressureDiff &SubtreePDiff,
+                         const TargetRegisterInfo *TRI) {
+  dbgs() << "SU(" << SUNodeNum << ") PDiff: \t";
+  SUPDiff.dump(*TRI);
+  dbgs() << "Subtree starting with SU(" << SubtreeNodeNum << ") PDiff: \t";
+  SubtreePDiff.dump(*TRI);
+}
+#endif
+
 /// Apply a set of heuristics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -3545,11 +3769,120 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
     return true;
   }
 
+  // Experimental: produce the same output order as in the input.
+  if (INPUTORDER) {
+    if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+    return false;
+  }
+
   // Bias PhysReg Defs and copies to their uses and defined respectively.
   if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                  biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
     return TryCand.Reason != NoCand;
 
+  // Experimental scheduling for OOO target.
+  if (doOOOSchedForRegion(DAG->SUnits.size())) {
+
+    bool SkipPhysRegs = biasPhysReg(TryCand.SU, TryCand.AtTop) &&
+                        biasPhysReg(Cand.SU, Cand.AtTop);
+
+    if (REGPRESS && !SkipPhysRegs) {
+      // Schedule from NextQueue until it's empty.
+      if (DFS && !NextQueue.empty() &&
+          tryGreater(NextQueue.count(TryCand.SU), NextQueue.count(Cand.SU),
+                     TryCand, Cand, GenericSchedulerBase::RegPressure))
+        return TryCand.Reason != NoCand;
+
+      // Compare pressure diffs between the candidates and schedule an
+      // instruction that decreases register pressure such as an immediate
+      // load below the other.
+      PressureDiff CandMIPDiff;
+      PressureDiff TryCandMIPDiff;
+      getMIPDiff(Cand.SU->getInstr(), CandMIPDiff);
+      getMIPDiff(TryCand.SU->getInstr(), TryCandMIPDiff);
+      if (int TryCandRPScore = comparePDiffs(CandMIPDiff, TryCandMIPDiff)) {
+        LLVM_DEBUG(dbgs() << "SU(" << Cand.SU->NodeNum << ") PDiff: \t";
+                   CandMIPDiff.dump(*TRI);
+                   dbgs() << "SU(" << TryCand.SU->NodeNum << ") PDiff: \t";
+                   TryCandMIPDiff.dump(*TRI););
+        tryLess(TryCandRPScore, 0, TryCand, Cand, GenericSchedulerBase::RegPressure);
+        return TryCand.Reason != NoCand;
+      }
+
+      // See if there is a subtree that would reduce register pressure if
+      // scheduled.
+      if (DFS && NextQueue.empty()) {
+        PressureDiff TryCandTreePDiff;
+        getTreePDiff(TryCand.SU->NodeNum, TryCandTreePDiff);
+        if (comparePDiffs(CandMIPDiff, TryCandTreePDiff) < 0) {
+          LLVM_DEBUG(dumpPDiffs_SU_STree(Cand.SU->NodeNum, TryCand.SU->NodeNum,
+                                         CandMIPDiff, TryCandTreePDiff, TRI););
+          TryCand.Reason = GenericSchedulerBase::RegPressure;
+          NextSubtreeSU = TryCand.SU;
+          return true;
+        }
+        PressureDiff CandTreePDiff;
+        getTreePDiff(Cand.SU->NodeNum, CandTreePDiff);
+        if (comparePDiffs(TryCandMIPDiff, CandTreePDiff) < 0) {
+          LLVM_DEBUG(dumpPDiffs_SU_STree(TryCand.SU->NodeNum, Cand.SU->NodeNum,
+                                         TryCandMIPDiff, CandTreePDiff, TRI););
+          Cand.Reason = GenericSchedulerBase::RegPressure;
+          NextSubtreeSU = Cand.SU;
+          return false;
+        }
+      }
+
+      // An SU that only increases register pressure (bottom-up) would help
+      // register pressure if scheduled higher (e.g. a store). Don't push all
+      // of its predecessors further up, but at least make sure that the SU
+      // is scheduled immediately after its predecessor in the input order.
+      auto onlyIncreases = [&](PressureDiff &PDiff) -> bool {
+        bool Incr = false;
+        bool Decr = false;
+        for (const PressureChange &PC : PDiff) {
+          if (!PC.isValid())
+            break;
+          (PC.getUnitInc() > 0 ? Incr : Decr) = true;
+        }
+        return Incr && !Decr;
+      };
+      auto maxPredNum = [&](const SUnit *SU) -> unsigned {
+        unsigned MaxPredNodeNum = 0;
+        for (const SDep &Pred : SU->Preds)
+          if (Pred.getSUnit() != &DAG->EntrySU &&
+              Pred.getSUnit()->NodeNum > MaxPredNodeNum)
+            MaxPredNodeNum = Pred.getSUnit()->NodeNum;
+        return MaxPredNodeNum;
+      };
+      bool TryCandIncr = onlyIncreases(TryCandMIPDiff);
+      bool CandIncr    = onlyIncreases(CandMIPDiff);
+      if (TryCandIncr != CandIncr) {
+        bool TryCandHeur = (TryCandIncr &&
+                            maxPredNum(TryCand.SU) < Cand.SU->NodeNum);
+        bool CandHeur = (CandIncr &&
+                         maxPredNum(Cand.SU) < TryCand.SU->NodeNum);
+        if (tryLess(TryCandHeur, CandHeur, TryCand, Cand,
+                    GenericSchedulerBase::RegPressure))
+          return TryCand.Reason != NoCand;
+      }
+    }
+
+    if (!SkipPhysRegs && doHeightHeurForRegion(DAG, DAGHeight) &&
+        tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Fall through to original instruction order.
+    if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+
+    return false;
+  }
+
   // Avoid exceeding the target's limit.
   if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
                                                Cand.RPDelta.Excess,
@@ -3654,6 +3987,11 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
   // getMaxPressureDelta temporarily modifies the tracker.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
 
+  LLVM_DEBUG( dbgs() << "Live regs: ";
+              for (auto R : LiveRegs)
+                dbgs() << "%" << Register::virtReg2Index(R) << ", ";
+              dbgs() << "\n";);
+
   ReadyQueue &Q = Zone.Available;
   for (SUnit *SU : Q) {
 
@@ -3853,6 +4191,59 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
     if (SU->hasPhysRegDefs)
       reschedulePhysReg(SU, false);
   }
+
+  MachineInstr *MI = SU->getInstr();
+  for (auto &MO : MI->explicit_operands())
+    if (!skipOp(MO, &DAG->MRI)) {
+      if (MO.isDef()) {
+        assert(LiveRegs.count(MO.getReg()) || MO.isDead());
+        if (!MO.getSubReg()) {
+          LiveRegs.erase(MO.getReg());
+        } else {
+          const LiveInterval &LI = DAG->getLIS()->getInterval(MO.getReg());
+          LiveQueryResult LRQ =
+            LI.Query(DAG->getLIS()->getInstructionIndex(*MI));
+          if (!LRQ.valueIn())
+            LiveRegs.erase(MO.getReg());
+        }
+      }
+      else if (MO.readsReg())
+        LiveRegs.insert(MO.getReg());
+    }
+  ++NumScheduled;
+
+  if (NextSubtreeSU) {
+    assert(NextQueue.empty());
+    if (NextSubtreeSU == SU) {
+      for (auto *TSU : TreeSUs[SU->NodeNum])
+        if (!TSU->isScheduled)
+          NextQueue.insert(TSU);
+      LLVM_DEBUG(dbgs() << "Scheduling subtree: ";
+                 for (auto *NxSU : NextQueue)
+                   dbgs() << NxSU->NodeNum << " ";
+                 dbgs() << "\n";);
+    }
+    NextSubtreeSU = nullptr;
+  }
+
+  if (!NextQueue.empty()) {
+    assert(NextQueue.count(SU) && "Failed to schedule planned SU.");
+    NextQueue.erase(SU);
+#ifndef NDEBUG
+    const SchedDFSResult *DFSResult = DAG->getDFSResult();
+    unsigned SUTree = DFSResult->getSubtreeID(SU);
+    for (const SDep &Pred : SU->Preds) {
+      const SUnit *PredSU = Pred.getSUnit();
+      assert((PredSU->isBoundaryNode() ||
+              Pred.getKind() != SDep::Data ||
+              (DFSResult->getSubtreeID(PredSU) == SUTree &&
+               NextQueue.count(PredSU)) ||
+              LiveRegs.count(Pred.getReg()) ||
+              Register::isPhysicalRegister(Pred.getReg())) &&
+             "Expected no data edges exiting the subtree.");
+    }
+#endif
+  }
 }
 
 /// Create the standard converging machine scheduler. This will be used as the
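
All of the OOO heuristics above are gated behind new experimental cl::opts.
As a rough illustration (the non-default values are arbitrary), they could
be exercised with something like:

  $ llc -misched-ooo -no-ooosched-below=10 -dfs-size=8 foo.ll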
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index e09ca747e4fe2..d0badd3692e40 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -72,22 +72,6 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
       InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
       TLInfo(TM, *this), FrameLowering(SystemZFrameLowering::create(*this)) {}
 
-
-// EXPERIMENTAL
-cl::opt<unsigned> NoSchedAbove("nosched-above", cl::init(~0U));
-bool SystemZSubtarget::disableForRegionPreRA(MachineBasicBlock::iterator Begin,
-                                             MachineBasicBlock::iterator End,
-                                             unsigned NumRegionInstrs) const {
-  // It seems that the generic scheduler currently can increase spilling heavily
-  // with big / huge regions. Disable it until it is fixed.
-  if (NumRegionInstrs > NoSchedAbove) {
-    LLVM_DEBUG(dbgs() << "Disabling pre-ra mischeduling of region with "
-                      << NumRegionInstrs << " instructions\n";);
-    return true;
-  }
-  return false;
-}
-
 bool SystemZSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index c5749405cc71a..5fa7c8f194ebf 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -89,12 +89,6 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
   // "source" order scheduler.
   bool enableMachineScheduler() const override { return true; }
 
-  // Don't use pre-ra mischeduler for huge regions where it creates a lot of
-  // spilling (temporary solution).
-  bool disableForRegionPreRA(MachineBasicBlock::iterator Begin,
-                             MachineBasicBlock::iterator End,
-                             unsigned NumRegionInstrs) const override;
-
   // This is important for reducing register pressure in vector code.
   bool useAA() const override { return true; }
 

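To make the comparePDiffs() scoring concrete, here is a standalone toy
analogue over plain structs (this is not LLVM's PressureDiff API; the types
and values are illustrative only):

  #include <cassert>
  #include <vector>

  // Toy stand-in for a PressureChange: a pressure-set id plus a signed
  // unit increment for that set.
  struct Change { unsigned PSet; int UnitInc; };
  using Diff = std::vector<Change>;

  // Same rule as the patch: score only pressure sets where one diff
  // increases and the other decreases; the sign follows the second diff.
  int comparePDiffs(const Diff &D1, const Diff &D2) {
    int Score = 0;
    for (const Change &C1 : D1) {
      for (const Change &C2 : D2) {
        if (C1.PSet == C2.PSet && (C2.UnitInc < 0) != (C1.UnitInc < 0)) {
          Score += C2.UnitInc < 0 ? -1 : 1;
          break;
        }
      }
    }
    return Score;
  }

  int main() {
    Diff CandD    = {{0, +1}};  // Cand raises pressure set 0.
    Diff TryCandD = {{0, -1}};  // TryCand lowers pressure set 0.
    // Negative result: TryCand relieves pressure that Cand would add.
    assert(comparePDiffs(CandD, TryCandD) == -1);
    return 0;
  }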

