[llvm] 7ca9e47 - [AMDGPU] Start refactoring GCNSchedStrategy

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 26 08:55:28 PDT 2022


Author: Austin Kerbow
Date: 2022-07-26T08:55:19-07:00
New Revision: 7ca9e471fe5b5ec51d151774e52dd0d5bd8f0ad0

URL: https://github.com/llvm/llvm-project/commit/7ca9e471fe5b5ec51d151774e52dd0d5bd8f0ad0
DIFF: https://github.com/llvm/llvm-project/commit/7ca9e471fe5b5ec51d151774e52dd0d5bd8f0ad0.diff

LOG: [AMDGPU] Start refactoring GCNSchedStrategy

Tries to make the different scheduling stages a bit more self-contained and
modifiable. Intended to be NFC. Preface to other changes.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130147

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 04da14cc4916..859deae86f35 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -9,6 +9,18 @@
 /// \file
 /// This contains a MachineSchedStrategy implementation for maximizing wave
 /// occupancy on GCN hardware.
+///
+/// This pass will apply multiple scheduling stages to the same function.
+/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
+/// entry point for the scheduling of those regions is
+/// GCNScheduleDAGMILive::runSchedStages.
+///
+/// Generally, the reason for having multiple scheduling stages is to account
+/// for the kernel-wide effect of register usage on occupancy.  Usually, only a
+/// few scheduling regions will have register pressure high enough to limit
+/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
+/// other regions.
+///
 //===----------------------------------------------------------------------===//
 
 #include "GCNSchedStrategy.h"
@@ -20,9 +32,9 @@
 using namespace llvm;
 
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C) :
-    GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
-    HasExcessPressure(false), MF(nullptr) { }
+    const MachineSchedContext *C)
+    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
+      HasClusteredNodes(false), HasExcessPressure(false) {}
 
 void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
@@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
-GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
-                        std::unique_ptr<MachineSchedStrategy> S) :
-  ScheduleDAGMILive(C, std::move(S)),
-  ST(MF.getSubtarget<GCNSubtarget>()),
-  MFI(*MF.getInfo<SIMachineFunctionInfo>()),
-  StartingOccupancy(MFI.getOccupancy()),
-  MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(
+    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
+    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
+      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
 
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == Collect) {
-    // Just record regions at the first pass.
-    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
-    return;
-  }
-
-  std::vector<MachineInstr*> Unsched;
-  Unsched.reserve(NumRegionInstrs);
-  for (auto &I : *this) {
-    Unsched.push_back(&I);
-  }
-
-  GCNRegPressure PressureBefore;
-  if (LIS) {
-    PressureBefore = Pressure[RegionIdx];
-
-    LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
-               GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
-               dbgs() << "Region live-in pressure:  ";
-               llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
-               dbgs() << "Region register pressure: ";
-               PressureBefore.print(dbgs()));
-  }
-
-  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
-  // Set HasClusteredNodes to true for late stages where we have already
-  // collected it. That way pickNode() will not scan SDep's when not needed.
-  S.HasClusteredNodes = Stage > InitialSchedule;
-  S.HasExcessPressure = false;
-  ScheduleDAGMILive::schedule();
-  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
-  RescheduleRegions[RegionIdx] = false;
-  if (Stage == InitialSchedule && S.HasClusteredNodes)
-    RegionsWithClusters[RegionIdx] = true;
-  if (S.HasExcessPressure)
-    RegionsWithHighRP[RegionIdx] = true;
-
-  if (!LIS)
-    return;
-
-  // Check the results of scheduling.
-  auto PressureAfter = getRealRegPressure();
-
-  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
-             PressureAfter.print(dbgs()));
-
-  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
-    Pressure[RegionIdx] = PressureAfter;
-    RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == MinOccupancy;
-
-    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
-    return;
-  }
-
-  unsigned WavesAfter =
-      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
-  unsigned WavesBefore =
-      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
-  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
-                    << ", after " << WavesAfter << ".\n");
-
-  // We may not be able to keep the current target occupancy because of the just
-  // scheduled region. We might still be able to revert scheduling if the
-  // occupancy before was higher, or if the current schedule has register
-  // pressure higher than the excess limits which could lead to more spilling.
-  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
-
-  // Allow memory bound functions to drop to 4 waves if not limited by an
-  // attribute.
-  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
-      WavesAfter >= MFI.getMinAllowedOccupancy()) {
-    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
-                      << MFI.getMinAllowedOccupancy() << " waves\n");
-    NewOccupancy = WavesAfter;
-  }
-
-  if (NewOccupancy < MinOccupancy) {
-    MinOccupancy = NewOccupancy;
-    MFI.limitOccupancy(MinOccupancy);
-    RegionsWithMinOcc.reset();
-    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
-                      << MinOccupancy << ".\n");
-  }
-
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
-  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
-      PressureAfter.getSGPRNum() > MaxSGPRs) {
-    RescheduleRegions[RegionIdx] = true;
-    RegionsWithHighRP[RegionIdx] = true;
-  }
-
-  // If this condition is true, then either the occupancy before and after
-  // scheduling is the same, or we are allowing the occupancy to drop because
-  // the function is memory bound. Even if we are OK with the current occupancy,
-  // we still need to verify that we will not introduce any extra chance of
-  // spilling.
-  if (WavesAfter >= MinOccupancy) {
-    if (Stage == UnclusteredReschedule &&
-        !PressureAfter.less(ST, PressureBefore)) {
-      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
-    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
-        PressureAfter.less(ST, PressureBefore) ||
-        !RescheduleRegions[RegionIdx]) {
-      Pressure[RegionIdx] = PressureAfter;
-      RegionsWithMinOcc[RegionIdx] =
-          PressureAfter.getOccupancy(ST) == MinOccupancy;
-      if (!RegionsWithClusters[RegionIdx] &&
-          (Stage + 1) == UnclusteredReschedule)
-        RescheduleRegions[RegionIdx] = false;
-      return;
-    } else {
-      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
-    }
-  }
-
-  RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST) == MinOccupancy;
-  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
-  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
-                                 (Stage + 1) != UnclusteredReschedule;
-  RegionEnd = RegionBegin;
-  int SkippedDebugInstr = 0;
-  for (MachineInstr *MI : Unsched) {
-    if (MI->isDebugInstr()) {
-      ++SkippedDebugInstr;
-      continue;
-    }
-
-    if (MI->getIterator() != RegionEnd) {
-      BB->remove(MI);
-      BB->insert(RegionEnd, MI);
-      if (!MI->isDebugInstr())
-        LIS->handleMove(*MI, true);
-    }
-    // Reset read-undef flags and update them later.
-    for (auto &Op : MI->operands())
-      if (Op.isReg() && Op.isDef())
-        Op.setIsUndef(false);
-    RegisterOperands RegOpers;
-    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
-    if (!MI->isDebugInstr()) {
-      if (ShouldTrackLaneMasks) {
-        // Adjust liveness and add missing dead+read-undef flags.
-        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
-        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
-      } else {
-        // Adjust for missing dead-def flags.
-        RegOpers.detectDeadDefs(*MI, *LIS);
-      }
-    }
-    RegionEnd = MI->getIterator();
-    ++RegionEnd;
-    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
-  }
-
-  // After reverting schedule, debug instrs will now be at the end of the block
-  // and RegionEnd will point to the first debug instr. Increment RegionEnd
-  // pass debug instrs to the actual end of the scheduling region.
-  while (SkippedDebugInstr-- > 0)
-    ++RegionEnd;
-
-  // If Unsched.front() instruction is a debug instruction, this will actually
-  // shrink the region since we moved all debug instructions to the end of the
-  // block. Find the first instruction that is not a debug instruction.
-  RegionBegin = Unsched.front()->getIterator();
-  if (RegionBegin->isDebugInstr()) {
-    for (MachineInstr *MI : Unsched) {
-      if (MI->isDebugInstr())
-        continue;
-      RegionBegin = MI->getIterator();
-      break;
-    }
-  }
-
-  // Then move the debug instructions back into their correct place and set
-  // RegionBegin and RegionEnd if needed.
-  placeDebugValues();
-
-  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  // Collect all scheduling regions. The actual scheduling is performed in
+  // GCNScheduleDAGMILive::finalizeSchedule.
+  Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
 }
 
-GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+GCNRegPressure
+GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
   GCNDownwardRPTracker RPTracker(*LIS);
   RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
   return RPTracker.moveMaxPressure();
 }
 
-void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
+                                                const MachineBasicBlock *MBB) {
   GCNDownwardRPTracker RPTracker(*LIS);
 
   // If the block has the only successor then live-ins of that successor are
@@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
     RPTracker.reset(*I, &LRS);
   }
 
-  for ( ; ; ) {
+  for (;;) {
     I = RPTracker.getNext();
 
     if (Regions[CurRegion].first == I || NonDbgMI == I) {
@@ -588,8 +420,9 @@ GCNScheduleDAGMILive::getBBLiveInMap() const {
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
-  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
-
+  // Start actual scheduling here. This function is called by the base
+  // MachineScheduler after all regions have been recorded by
+  // GCNScheduleDAGMILive::schedule().
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
   RescheduleRegions.resize(Regions.size());
@@ -601,142 +434,470 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   RegionsWithHighRP.reset();
   RegionsWithMinOcc.reset();
 
+  runSchedStages();
+}
+
+void GCNScheduleDAGMILive::runSchedStages() {
+  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+  InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
+  UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
+  ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
+                          *this);
+  PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
+  GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
+
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+  for (auto *Stage : SchedStages) {
+    if (!Stage->initGCNSchedStage())
+      continue;
 
-  do {
-    Stage++;
-    RegionIdx = 0;
-    MachineBasicBlock *MBB = nullptr;
+    for (auto Region : Regions) {
+      RegionBegin = Region.first;
+      RegionEnd = Region.second;
+      // Setup for scheduling the region and check whether it should be skipped.
+      if (!Stage->initGCNRegion()) {
+        Stage->advanceRegion();
+        exitRegion();
+        continue;
+      }
 
-    if (Stage > InitialSchedule) {
-      if (!LIS)
-        break;
+      ScheduleDAGMILive::schedule();
+      Stage->finalizeGCNRegion();
+    }
 
-      // Retry function scheduling if we found resulting occupancy and it is
-      // lower than used for first pass scheduling. This will give more freedom
-      // to schedule low register pressure blocks.
-      // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+    Stage->finalizeGCNSchedStage();
+  }
+}
 
-      if (Stage == UnclusteredReschedule) {
-        if (RescheduleRegions.none())
-          continue;
-        LLVM_DEBUG(dbgs() <<
-          "Retrying function scheduling without clustering.\n");
-      }
+#ifndef NDEBUG
+raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
+  switch (StageID) {
+  case GCNSchedStageID::InitialSchedule:
+    OS << "Initial Schedule";
+    break;
+  case GCNSchedStageID::UnclusteredReschedule:
+    OS << "Unclustered Reschedule";
+    break;
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    OS << "Clustered Low Occupancy Reschedule";
+    break;
+  case GCNSchedStageID::PreRARematerialize:
+    OS << "Pre-RA Rematerialize";
+    break;
+  }
+  return OS;
+}
+#endif
 
-      if (Stage == ClusteredLowOccupancyReschedule) {
-        if (StartingOccupancy <= MinOccupancy)
-          break;
+GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+    : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
+      MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
 
-        LLVM_DEBUG(
-            dbgs()
-            << "Retrying function scheduling with lowest recorded occupancy "
-            << MinOccupancy << ".\n");
-      }
+bool GCNSchedStage::initGCNSchedStage() {
+  if (!DAG.LIS)
+    return false;
 
-      if (Stage == PreRARematerialize) {
-        if (RegionsWithMinOcc.none() || Regions.size() == 1)
-          break;
-
-        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-        // Check maximum occupancy
-        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
-            MinOccupancy)
-          break;
-
-        // FIXME: This pass will invalidate cached MBBLiveIns for regions
-        // inbetween the defs and region we sinked the def to. Cached pressure
-        // for regions where a def is sinked from will also be invalidated. Will
-        // need to be fixed if there is another pass after this pass.
-        static_assert(LastStage == PreRARematerialize,
-                      "Passes after PreRARematerialize are not supported");
-
-        collectRematerializableInstructions();
-        if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
-          break;
-
-        LLVM_DEBUG(
-            dbgs() << "Retrying function scheduling with improved occupancy of "
-                   << MinOccupancy << " from rematerializing\n");
-      }
-    }
+  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
+  return true;
+}
 
-    if (Stage == UnclusteredReschedule)
-      SavedMutations.swap(Mutations);
+bool UnclusteredRescheduleStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
 
-    for (auto Region : Regions) {
-      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
-           !RescheduleRegions[RegionIdx]) ||
-          (Stage == ClusteredLowOccupancyReschedule &&
-           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
+  if (DAG.RescheduleRegions.none())
+    return false;
 
-        ++RegionIdx;
-        continue;
-      }
+  SavedMutations.swap(DAG.Mutations);
 
-      RegionBegin = Region.first;
-      RegionEnd = Region.second;
+  LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
+  return true;
+}
 
-      if (RegionBegin->getParent() != MBB) {
-        if (MBB) finishBlock();
-        MBB = RegionBegin->getParent();
-        startBlock(MBB);
-        if (Stage == InitialSchedule)
-          computeBlockPressure(MBB);
-      }
+bool ClusteredLowOccStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
 
-      unsigned NumRegionInstrs = std::distance(begin(), end());
-      enterRegion(MBB, begin(), end(), NumRegionInstrs);
+  // Don't bother trying to improve ILP in lower RP regions if occupancy has not
+  // been dropped. All regions will have already been scheduled with the ideal
+  // occupancy targets.
+  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
+    return false;
 
-      // Skip empty scheduling regions (0 or 1 schedulable instructions).
-      if (begin() == end() || begin() == std::prev(end())) {
-        exitRegion();
-        ++RegionIdx;
-        continue;
-      }
+  LLVM_DEBUG(
+      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
+             << DAG.MinOccupancy << ".\n");
+  return true;
+}
 
-      LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
-      LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
-                        << MBB->getName() << "\n  From: " << *begin()
-                        << "    To: ";
-                 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
-                 else dbgs() << "End";
-                 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+bool PreRARematStage::initGCNSchedStage() {
+  if (!GCNSchedStage::initGCNSchedStage())
+    return false;
+
+  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
+    return false;
 
-      schedule();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  // Check maximum occupancy
+  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
+      DAG.MinOccupancy)
+    return false;
+
+  // FIXME: This pass will invalidate cached MBBLiveIns for regions
+  // in between the defs and region we sunk the def to. Cached pressure
+  // for regions where a def is sunk from will also be invalidated. Will
+  // need to be fixed if there is another pass after this pass.
+
+  collectRematerializableInstructions();
+  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+    return false;
 
-      exitRegion();
-      ++RegionIdx;
+  LLVM_DEBUG(
+      dbgs() << "Retrying function scheduling with improved occupancy of "
+             << DAG.MinOccupancy << " from rematerializing\n");
+  return true;
+}
+
+void GCNSchedStage::finalizeGCNSchedStage() {
+  DAG.finishBlock();
+  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
+}
+
+void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
+  SavedMutations.swap(DAG.Mutations);
+
+  GCNSchedStage::finalizeGCNSchedStage();
+}
+
+bool GCNSchedStage::initGCNRegion() {
+  // Check whether this new region is also a new block.
+  if (DAG.RegionBegin->getParent() != CurrentMBB)
+    setupNewBlock();
+
+  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
+  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
+
+  // Skip empty scheduling regions (0 or 1 schedulable instructions).
+  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
+                    << " " << CurrentMBB->getName()
+                    << "\n  From: " << *DAG.begin() << "    To: ";
+             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
+             else dbgs() << "End";
+             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+  // Save original instruction order before scheduling for possible revert.
+  Unsched.clear();
+  Unsched.reserve(DAG.NumRegionInstrs);
+  for (auto &I : DAG)
+    Unsched.push_back(&I);
+
+  PressureBefore = DAG.Pressure[RegionIdx];
+
+  LLVM_DEBUG(
+      dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+      GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI);
+      dbgs() << "Region live-in pressure:  ";
+      llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
+      dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));
+
+  // Set HasClusteredNodes to true for late stages where we have already
+  // collected it. That way pickNode() will not scan SDep's when not needed.
+  S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
+  S.HasExcessPressure = false;
+
+  return true;
+}
+
+bool UnclusteredRescheduleStage::initGCNRegion() {
+  if (!DAG.RescheduleRegions[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+bool ClusteredLowOccStage::initGCNRegion() {
+  // We may need to reschedule this region if it doesn't have clusters so it
+  // wasn't rescheduled in the last stage, or if we found it was testing
+  // critical register pressure limits in the unclustered reschedule stage. The
+  // latter is because we may not have been able to raise the min occupancy in
+  // the previous stage so the region may be overly constrained even if it was
+  // already rescheduled.
+  if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+bool PreRARematStage::initGCNRegion() {
+  if (!DAG.RescheduleRegions[RegionIdx])
+    return false;
+
+  return GCNSchedStage::initGCNRegion();
+}
+
+void GCNSchedStage::setupNewBlock() {
+  if (CurrentMBB)
+    DAG.finishBlock();
+
+  CurrentMBB = DAG.RegionBegin->getParent();
+  DAG.startBlock(CurrentMBB);
+  // Get real RP for the region if it hasn't been calculated before. After the
+  // initial schedule stage real RP will be collected after scheduling.
+  if (StageID == GCNSchedStageID::InitialSchedule)
+    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
+}
+
+void GCNSchedStage::finalizeGCNRegion() {
+  DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
+  DAG.RescheduleRegions[RegionIdx] = false;
+  if (S.HasExcessPressure)
+    DAG.RegionsWithHighRP[RegionIdx] = true;
+
+  // Revert scheduling if we have dropped occupancy or there is some other
+  // reason that the original schedule is better.
+  checkScheduling();
+
+  DAG.exitRegion();
+  RegionIdx++;
+}
+
+void InitialScheduleStage::finalizeGCNRegion() {
+  // Record which regions have clustered nodes for the next unclustered
+  // reschedule stage.
+  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+  if (S.HasClusteredNodes)
+    DAG.RegionsWithClusters[RegionIdx] = true;
+
+  GCNSchedStage::finalizeGCNRegion();
+}
+
+void GCNSchedStage::checkScheduling() {
+  // Check the results of scheduling.
+  PressureAfter = DAG.getRealRegPressure(RegionIdx);
+  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
+             PressureAfter.print(dbgs()));
+
+  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+    DAG.Pressure[RegionIdx] = PressureAfter;
+    DAG.RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+
+    // Early out if we have achieved the occupancy target.
+    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+    return;
+  }
+
+  unsigned WavesAfter =
+      std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
+                    << ", after " << WavesAfter << ".\n");
+
+  // We may not be able to keep the current target occupancy because of the just
+  // scheduled region. We might still be able to revert scheduling if the
+  // occupancy before was higher, or if the current schedule has register
+  // pressure higher than the excess limits which could lead to more spilling.
+  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+
+  // Allow memory bound functions to drop to 4 waves if not limited by an
+  // attribute.
+  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
+      WavesAfter >= MFI.getMinAllowedOccupancy()) {
+    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+                      << MFI.getMinAllowedOccupancy() << " waves\n");
+    NewOccupancy = WavesAfter;
+  }
+
+  if (NewOccupancy < DAG.MinOccupancy) {
+    DAG.MinOccupancy = NewOccupancy;
+    MFI.limitOccupancy(DAG.MinOccupancy);
+    DAG.RegionsWithMinOcc.reset();
+    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
+                      << DAG.MinOccupancy << ".\n");
+  }
+
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+      PressureAfter.getAGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs) {
+    DAG.RescheduleRegions[RegionIdx] = true;
+    DAG.RegionsWithHighRP[RegionIdx] = true;
+  }
+
+  // Revert if this region's schedule would cause a drop in occupancy or
+  // spilling.
+  if (shouldRevertScheduling(WavesAfter)) {
+    revertScheduling();
+  } else {
+    DAG.Pressure[RegionIdx] = PressureAfter;
+    DAG.RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+  }
+}
+
+bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (WavesAfter < DAG.MinOccupancy)
+    return true;
+
+  return false;
+}
+
+bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
+  // Don't reschedule the region in the next stage if it doesn't have clusters.
+  if (!DAG.RegionsWithClusters[RegionIdx])
+    DAG.RescheduleRegions[RegionIdx] = false;
+
+  return false;
+}
+
+bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  // If RP is not reduced in the unclustered reschedule stage, revert to the old
+  // schedule.
+  if (!PressureAfter.less(ST, PressureBefore)) {
+    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    return true;
+  }
+
+  return false;
+}
+
+bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
+bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
+    return true;
+
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
+bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
+  if (WavesAfter <= MFI.getMinWavesPerEU() &&
+      !PressureAfter.less(ST, PressureBefore) &&
+      DAG.RescheduleRegions[RegionIdx]) {
+    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
+    return true;
+  }
+
+  return false;
+}
+
+void GCNSchedStage::revertScheduling() {
+  DAG.RegionsWithMinOcc[RegionIdx] =
+      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
+  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  DAG.RescheduleRegions[RegionIdx] =
+      DAG.RegionsWithClusters[RegionIdx] ||
+      (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
+  DAG.RegionEnd = DAG.RegionBegin;
+  int SkippedDebugInstr = 0;
+  for (MachineInstr *MI : Unsched) {
+    if (MI->isDebugInstr()) {
+      ++SkippedDebugInstr;
+      continue;
+    }
+
+    if (MI->getIterator() != DAG.RegionEnd) {
+      DAG.BB->remove(MI);
+      DAG.BB->insert(DAG.RegionEnd, MI);
+      if (!MI->isDebugInstr())
+        DAG.LIS->handleMove(*MI, true);
+    }
+
+    // Reset read-undef flags and update them later.
+    for (auto &Op : MI->operands())
+      if (Op.isReg() && Op.isDef())
+        Op.setIsUndef(false);
+    RegisterOperands RegOpers;
+    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
+    if (!MI->isDebugInstr()) {
+      if (DAG.ShouldTrackLaneMasks) {
+        // Adjust liveness and add missing dead+read-undef flags.
+        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
+        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
+      } else {
+        // Adjust for missing dead-def flags.
+        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
+      }
     }
-    finishBlock();
+    DAG.RegionEnd = MI->getIterator();
+    ++DAG.RegionEnd;
+    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
+  }
+
+  // After reverting schedule, debug instrs will now be at the end of the block
+  // and RegionEnd will point to the first debug instr. Increment RegionEnd
+  // past debug instrs to the actual end of the scheduling region.
+  while (SkippedDebugInstr-- > 0)
+    ++DAG.RegionEnd;
+
+  // If Unsched.front() instruction is a debug instruction, this will actually
+  // shrink the region since we moved all debug instructions to the end of the
+  // block. Find the first instruction that is not a debug instruction.
+  DAG.RegionBegin = Unsched.front()->getIterator();
+  if (DAG.RegionBegin->isDebugInstr()) {
+    for (MachineInstr *MI : Unsched) {
+      if (MI->isDebugInstr())
+        continue;
+      DAG.RegionBegin = MI->getIterator();
+      break;
+    }
+  }
+
+  // Then move the debug instructions back into their correct place and set
+  // RegionBegin and RegionEnd if needed.
+  DAG.placeDebugValues();
 
-    if (Stage == UnclusteredReschedule)
-      SavedMutations.swap(Mutations);
-  } while (Stage != LastStage);
+  DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-void GCNScheduleDAGMILive::collectRematerializableInstructions() {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+void PreRARematStage::collectRematerializableInstructions() {
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
+  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
-    if (!LIS->hasInterval(Reg))
+    if (!DAG.LIS->hasInterval(Reg))
       continue;
 
     // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
-        !MRI.hasOneNonDBGUse(Reg))
+    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
       continue;
 
-    MachineOperand *Op = MRI.getOneDef(Reg);
+    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
     MachineInstr *Def = Op->getParent();
     if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
       continue;
 
-    MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
+    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
     if (Def->getParent() == UseI->getParent())
       continue;
 
@@ -744,10 +905,10 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
     // live-through or used inside regions at MinOccupancy. This means that the
     // register must be in the live-in set for the region.
     bool AddedToRematList = false;
-    for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
-      auto It = LiveIns[I].find(Reg);
-      if (It != LiveIns[I].end() && !It->second.none()) {
-        if (RegionsWithMinOcc[I]) {
+    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+      auto It = DAG.LiveIns[I].find(Reg);
+      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
+        if (DAG.RegionsWithMinOcc[I]) {
           RematerializableInsts[I][Def] = UseI;
           AddedToRematList = true;
         }
@@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
   }
 }
 
-bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                                   const TargetInstrInfo *TII) {
+bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+                                              const TargetInstrInfo *TII) {
   // Temporary copies of cached variables we will be modifying and replacing if
   // sinking succeeds.
   SmallVector<
@@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
   DenseMap<unsigned, GCNRegPressure> NewPressure;
   BitVector NewRescheduleRegions;
+  LiveIntervals *LIS = DAG.LIS;
 
-  NewRegions.resize(Regions.size());
-  NewRescheduleRegions.resize(Regions.size());
+  NewRegions.resize(DAG.Regions.size());
+  NewRescheduleRegions.resize(DAG.Regions.size());
 
   // Collect only regions that have a rematerializable def as a live-in.
   SmallSet<unsigned, 16> ImpactedRegions;
@@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   // Make copies of register pressure and live-ins cache that will be updated
   // as we rematerialize.
   for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = Pressure[Idx];
-    NewLiveIns[Idx] = LiveIns[Idx];
+    NewPressure[Idx] = DAG.Pressure[Idx];
+    NewLiveIns[Idx] = DAG.LiveIns[Idx];
   }
-  NewRegions = Regions;
+  NewRegions = DAG.Regions;
   NewRescheduleRegions.reset();
 
   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
   bool Improved = false;
   for (auto I : ImpactedRegions) {
-    if (!RegionsWithMinOcc[I])
+    if (!DAG.RegionsWithMinOcc[I])
       continue;
 
     Improved = false;
@@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
     // TODO: Handle occupancy drop due to AGPR and SGPR.
     // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
+    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
       break;
 
     // The occupancy of this region could have been improved by a previous
     // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
+    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
       NewRescheduleRegions[I] = true;
       Improved = true;
       continue;
@@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
     // If in the most optimistic scenario, we cannot improve occupancy, then do
     // not attempt to sink any instructions.
-    if (OptimisticOccupancy <= MinOccupancy)
+    if (OptimisticOccupancy <= DAG.MinOccupancy)
       break;
 
     unsigned ImproveOccupancy = 0;
@@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       // call LiveRangeEdit::allUsesAvailableAt() and
       // LiveRangeEdit::canRematerializeAt().
       TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                         Def->getOperand(0).getSubReg(), *Def, *TRI);
+                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
       MachineInstr *NewMI = &*(--InsertPos);
       LIS->InsertMachineInstrInMaps(*NewMI);
       LIS->removeInterval(Reg);
@@ -851,11 +1013,11 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
       // Update region boundaries in scheduling region we sinked from since we
       // may sink an instruction that was at the beginning or end of its region
-      updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
-                             /*Removing =*/true);
+      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
+                                 /*Removing =*/true);
 
       // Update region boundaries in region we sinked to.
-      updateRegionBoundaries(NewRegions, InsertPos, NewMI);
+      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
 
       LaneBitmask PrevMask = NewLiveIns[I][Reg];
       // FIXME: Also update cached pressure for where the def was sinked from.
@@ -863,9 +1025,9 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       // the reg from all regions as a live-in.
       for (auto Idx : RematDefToLiveInRegions[Def]) {
         NewLiveIns[Idx].erase(Reg);
-        if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
+        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
           // Def is live-through and not used in this block.
-          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
+          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
         } else {
           // Def is used and rematerialized into this block.
           GCNDownwardRPTracker RPT(*LIS);
@@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
       SinkedDefs.push_back(Def);
       ImproveOccupancy = NewPressure[I].getOccupancy(ST);
-      if (ImproveOccupancy > MinOccupancy)
+      if (ImproveOccupancy > DAG.MinOccupancy)
         break;
     }
 
@@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
       for (auto TrackedIdx : RematDefToLiveInRegions[Def])
         RematerializableInsts[TrackedIdx].erase(Def);
 
-    if (ImproveOccupancy <= MinOccupancy)
+    if (ImproveOccupancy <= DAG.MinOccupancy)
       break;
 
     NewRescheduleRegions[I] = true;
@@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
     MachineInstr *OldMI = Entry.second;
 
     // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
-    BBLiveInMap.erase(OldMI);
+    DAG.BBLiveInMap.erase(OldMI);
 
     // Remove OldMI and update LIS
     Register Reg = MI->getOperand(0).getReg();
@@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
 
   // Update live-ins, register pressure, and regions caches.
   for (auto Idx : ImpactedRegions) {
-    LiveIns[Idx] = NewLiveIns[Idx];
-    Pressure[Idx] = NewPressure[Idx];
-    MBBLiveIns.erase(Regions[Idx].first->getParent());
+    DAG.LiveIns[Idx] = NewLiveIns[Idx];
+    DAG.Pressure[Idx] = NewPressure[Idx];
+    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
   }
-  Regions = NewRegions;
-  RescheduleRegions = NewRescheduleRegions;
+  DAG.Regions = NewRegions;
+  DAG.RescheduleRegions = NewRescheduleRegions;
 
   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  MFI.increaseOccupancy(MF, ++MinOccupancy);
+  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
   return true;
 }
 
 // Copied from MachineLICM
-bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
-  if (!TII->isTriviallyReMaterializable(MI))
+bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
+  if (!DAG.TII->isTriviallyReMaterializable(MI))
     return false;
 
   for (const MachineOperand &MO : MI.operands())

diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c3db849cf81a..7aadf89e0bf7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,8 +28,6 @@ class GCNSubtarget;
 /// heuristics to determine excess/critical pressure sets.  Its goal is to
 /// maximize kernel occupancy (i.e. maximum number of waves per simd).
 class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
-  friend class GCNScheduleDAGMILive;
-
   SUnit *pickNodeBidirectional(bool &IsTopNode);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
                      unsigned SGPRPressure, unsigned VGPRPressure);
 
   std::vector<unsigned> Pressure;
+
   std::vector<unsigned> MaxPressure;
 
   unsigned SGPRExcessLimit;
+
   unsigned VGPRExcessLimit;
-  unsigned SGPRCriticalLimit;
-  unsigned VGPRCriticalLimit;
 
   unsigned TargetOccupancy;
 
+  MachineFunction *MF;
+
+public:
   // schedule() have seen a clustered memory operation. Set it to false
   // before a region scheduling to know if the region had such clusters.
   bool HasClusteredNodes;
@@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
   // register pressure for actual scheduling heuristics.
   bool HasExcessPressure;
 
-  MachineFunction *MF;
+  unsigned SGPRCriticalLimit;
+
+  unsigned VGPRCriticalLimit;
 
-public:
   GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
 
   void initialize(ScheduleDAGMI *DAG) override;
 
+  unsigned getTargetOccupancy() { return TargetOccupancy; }
+
   void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
 };
 
-class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+enum class GCNSchedStageID : unsigned {
+  InitialSchedule = 0,
+  UnclusteredReschedule = 1,
+  ClusteredLowOccupancyReschedule = 2,
+  PreRARematerialize = 3,
+  LastStage = PreRARematerialize
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
+
+inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
+  assert(Stage != GCNSchedStageID::PreRARematerialize);
+  Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+  return Stage;
+}
+
+inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
+  return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
+}
 
-  enum : unsigned {
-    Collect,
-    InitialSchedule,
-    UnclusteredReschedule,
-    ClusteredLowOccupancyReschedule,
-    PreRARematerialize,
-    LastStage = PreRARematerialize
-  };
+inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
+  return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
+}
+
+class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+  friend class GCNSchedStage;
+  friend class InitialScheduleStage;
+  friend class UnclusteredRescheduleStage;
+  friend class ClusteredLowOccStage;
+  friend class PreRARematStage;
 
   const GCNSubtarget &ST;
 
@@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Minimal real occupancy recorder for the function.
   unsigned MinOccupancy;
 
-  // Scheduling stage number.
-  unsigned Stage;
-
-  // Current region index.
-  size_t RegionIdx;
-
   // Vector of regions recorder for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
@@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Region pressure cache.
   SmallVector<GCNRegPressure, 32> Pressure;
 
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
+
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
+  // Return current region pressure.
+  GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
+
+  // Compute and cache live-ins and pressure for all regions in block.
+  void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
+
+  // Update region boundaries when removing MI or inserting NewMI before MI.
+  void updateRegionBoundaries(
+      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+                                MachineBasicBlock::iterator>> &RegionBoundaries,
+      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
+      bool Removing = false);
+
+  void runSchedStages();
+
+public:
+  GCNScheduleDAGMILive(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S);
+
+  void schedule() override;
+
+  void finalizeSchedule() override;
+};
+
+// GCNSchedStrategy applies multiple scheduling stages to a function.
+class GCNSchedStage {
+protected:
+  GCNScheduleDAGMILive &DAG;
+
+  GCNMaxOccupancySchedStrategy &S;
+
+  MachineFunction &MF;
+
+  SIMachineFunctionInfo &MFI;
+
+  const GCNSubtarget &ST;
+
+  const GCNSchedStageID StageID;
+
+  // The current block being scheduled.
+  MachineBasicBlock *CurrentMBB = nullptr;
+
+  // Current region index.
+  unsigned RegionIdx = 0;
+
+  // Record the original order of instructions before scheduling.
+  std::vector<MachineInstr *> Unsched;
+
+  // RP before scheduling the current region.
+  GCNRegPressure PressureBefore;
+
+  // RP after scheduling the current region.
+  GCNRegPressure PressureAfter;
+
+  GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
+
+public:
+  // Initialize state for a scheduling stage. Returns false if the current stage
+  // should be skipped.
+  virtual bool initGCNSchedStage();
+
+  // Finalize state after finishing a scheduling pass on the function.
+  virtual void finalizeGCNSchedStage();
+
+  // Setup for scheduling a region. Returns false if the current region should
+  // be skipped.
+  virtual bool initGCNRegion();
+
+  // Track whether a new region is also a new MBB.
+  void setupNewBlock();
+
+  // Finalize state after scheduling a region.
+  virtual void finalizeGCNRegion();
+
+  // Check result of scheduling.
+  void checkScheduling();
+
+  // Returns true if scheduling should be reverted.
+  virtual bool shouldRevertScheduling(unsigned WavesAfter);
+
+  // Returns true if the new schedule may result in more spilling.
+  bool mayCauseSpilling(unsigned WavesAfter);
+
+  // Attempt to revert scheduling for this region.
+  void revertScheduling();
+
+  void advanceRegion() { RegionIdx++; }
+
+  virtual ~GCNSchedStage() = default;
+};
+
+class InitialScheduleStage : public GCNSchedStage {
+public:
+  void finalizeGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+class UnclusteredRescheduleStage : public GCNSchedStage {
+private:
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
+public:
+  bool initGCNSchedStage() override;
+
+  void finalizeGCNSchedStage() override;
+
+  bool initGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+// Retry function scheduling if we found a resulting occupancy that is lower
+// than the one used for other scheduling passes. This will give more freedom
+// to schedule low register pressure blocks.
+class ClusteredLowOccStage : public GCNSchedStage {
+public:
+  bool initGCNSchedStage() override;
+
+  bool initGCNRegion() override;
+
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
+class PreRARematStage : public GCNSchedStage {
+private:
   // Each region at MinOccupancy will have their own list of trivially
   // rematerializable instructions we can remat to reduce RP. The list maps an
   // instruction to the position we should remat before, usually the MI using
@@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // that has the defined reg as a live-in.
   DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
 
-  // Temporary basic block live-in cache.
-  DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
-
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
-  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
-
   // Collect all trivially rematerializable VGPR instructions with a single def
   // and single use outside the defining block into RematerializableInsts.
   void collectRematerializableInstructions();
@@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                const TargetInstrInfo *TII);
 
-  // Return current region pressure.
-  GCNRegPressure getRealRegPressure() const;
-
-  // Compute and cache live-ins and pressure for all regions in block.
-  void computeBlockPressure(const MachineBasicBlock *MBB);
-
-  // Update region boundaries when removing MI or inserting NewMI before MI.
-  void updateRegionBoundaries(
-      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
-                                MachineBasicBlock::iterator>> &RegionBoundaries,
-      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
-      bool Removing = false);
-
 public:
-  GCNScheduleDAGMILive(MachineSchedContext *C,
-                       std::unique_ptr<MachineSchedStrategy> S);
+  bool initGCNSchedStage() override;
 
-  void schedule() override;
+  bool initGCNRegion() override;
 
-  void finalizeSchedule() override;
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
 };
 
 } // End namespace llvm


        


More information about the llvm-commits mailing list