[llvm] 7556626 - [CodeGen][MachinePipeliner] Limit register pressure when scheduling (#74807)

Mon Jan 22 00:06:42 PST 2024

Author: Ryotaro KASUGA
Date: 2024-01-22T17:06:37+09:00
New Revision: 7556626dcff15c8cc5160078a4d6ed2469eed81b

URL: https://github.com/llvm/llvm-project/commit/7556626dcff15c8cc5160078a4d6ed2469eed81b
DIFF: https://github.com/llvm/llvm-project/commit/7556626dcff15c8cc5160078a4d6ed2469eed81b.diff

LOG: [CodeGen][MachinePipeliner] Limit register pressure when scheduling (#74807)

In software pipelining, when searching for the Initiation Interval (II),
`MachinePipeliner` tries to reduce register pressure, but doesn't check
how many variables can actually be alive at the same time. As a result,
a lot of register spills/fills can be generated after register
allocation, which might cause performance degradation. To prevent such
cases, this patch adds a check phase that calculates the maximum
register pressure of the scheduled loop and rejects it if the pressure is
too high. This can be enabled by specifying
`pipeliner-register-pressure`. Additionally, the II search range is
currently fixed at 10, which is too small to find a schedule when the
above algorithm is applied. Therefore this patch also adds a new option
`pipeliner-ii-search-range` to specify the length of the range to
search. There is one more new option,
`pipeliner-register-pressure-margin`, which can be used to estimate a
register pressure limit lower than the actual one for conservative analysis.

Discourse thread:
https://discourse.llvm.org/t/considering-register-pressure-when-deciding-initiation-interval-in-machinepipeliner/74725

Added: 
    llvm/test/CodeGen/PowerPC/sms-regpress.mir

Modified: 
    llvm/include/llvm/CodeGen/MachinePipeliner.h
    llvm/lib/CodeGen/MachinePipeliner.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 04055ba9732dd4..8f0a17cf99967b 100644

--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -273,8 +273,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
   /// Return the new base register that was stored away for the changed
   /// instruction.
-  unsigned getInstrBaseReg(SUnit *SU) {
-    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+  unsigned getInstrBaseReg(SUnit *SU) const {
+    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::const_iterator It =
         InstrChanges.find(SU);
     if (It != InstrChanges.end())
       return It->second.first;
@@ -639,16 +639,20 @@ class SMSchedule {
   computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
                              TargetInstrInfo::PipelinerLoopInfo *PLI);
 
+  std::deque<SUnit *>
+  reorderInstructions(const SwingSchedulerDAG *SSD,
+                      const std::deque<SUnit *> &Instrs) const;
+
   bool
   normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
                                     TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
-  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                       std::deque<SUnit *> &Insts);
-  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
-  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
-                             MachineOperand &MO);
+  void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                       std::deque<SUnit *> &Insts) const;
+  bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const;
+  bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def,
+                             MachineOperand &MO) const;
   void print(raw_ostream &os) const;
   void dump() const;
 };

diff  --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8cd7f4ebe88d96..5c9f0f1703a6e4 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -35,6 +35,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -60,9 +61,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -174,6 +178,20 @@ static cl::opt<bool> ExperimentalCodeGen(
     cl::desc(
         "Use the experimental peeling code generator for software pipelining"));
 
+static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
+                                     cl::desc("Range to search for II"),
+                                     cl::Hidden, cl::init(10));
+
+static cl::opt<bool>
+    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
+                     cl::desc("Limit register pressure of scheduled loop"));
+
+static cl::opt<int>
+    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
+                      cl::init(5),
+                      cl::desc("Margin representing the unused percentage of "
+                               "the register pressure limit"));
+
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -484,7 +502,7 @@ void SwingSchedulerDAG::setMAX_II() {
   else if (II_setByPragma > 0)
     MAX_II = II_setByPragma;
   else
-    MAX_II = MII + 10;
+    MAX_II = MII + SwpIISearchRange;
 }
 
 /// We override the schedule function in ScheduleDAGInstrs to implement the
@@ -695,7 +713,8 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
 }
 
 /// Return the Phi register value that comes the loop block.
-static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+static unsigned getLoopPhiReg(const MachineInstr &Phi,
+                              const MachineBasicBlock *LoopBB) {
   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
       return Phi.getOperand(i).getReg();
@@ -996,6 +1015,41 @@ void SwingSchedulerDAG::changeDependences() {
   }
 }
 
+/// Create an instruction stream that represents a single iteration and stage of
+/// each instruction. This function differs from SMSchedule::finalizeSchedule in
+/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
+/// function is an approximation of SMSchedule::finalizeSchedule with all
+/// non-const operations removed.
+static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
+                                  SMSchedule &Schedule,
+                                  std::vector<MachineInstr *> &OrderedInsts,
+                                  DenseMap<MachineInstr *, unsigned> &Stages) {
+  DenseMap<int, std::deque<SUnit *>> Instrs;
+
+  // Move all instructions to the first stage from the later stages.
+  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+       ++Cycle) {
+    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
+         Stage <= LastStage; ++Stage) {
+      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
+               Cycle + Stage * Schedule.getInitiationInterval()))) {
+        Instrs[Cycle].push_front(SU);
+      }
+    }
+  }
+
+  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+       ++Cycle) {
+    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
+    CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs));
+    for (SUnit *SU : CycleInstrs) {
+      MachineInstr *MI = SU->getInstr();
+      OrderedInsts.push_back(MI);
+      Stages[MI] = Schedule.stageScheduled(SU);
+    }
+  }
+}
+
 namespace {
 
 // FuncUnitSorter - Comparison operator used to sort instructions by
@@ -1102,6 +1156,375 @@ struct FuncUnitSorter {
   }
 };
 
+/// Calculate the maximum register pressure of the scheduled instructions stream
+class HighRegisterPressureDetector {
+  MachineBasicBlock *OrigMBB;
+  const MachineFunction &MF;
+  const MachineRegisterInfo &MRI;
+  const TargetRegisterInfo *TRI;
+
+  const unsigned PSetNum;
+
+  // Indexed by PSet ID
+  // InitSetPressure takes into account the register pressure of live-in
+  // registers. It does not depend on how the loop is scheduled, so it is
+  // enough to calculate it once at the beginning.
+  std::vector<unsigned> InitSetPressure;
+
+  // Indexed by PSet ID
+  // Upper limit for each register pressure set
+  std::vector<unsigned> PressureSetLimit;
+
+  DenseMap<MachineInstr *, RegisterOperands> ROMap;
+
+  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
+
+public:
+  using OrderedInstsTy = std::vector<MachineInstr *>;
+  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
+
+private:
+  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
+    if (Pressures.size() == 0) {
+      dbgs() << "[]";
+    } else {
+      char Prefix = '[';
+      for (unsigned P : Pressures) {
+        dbgs() << Prefix << P;
+        Prefix = ' ';
+      }
+      dbgs() << ']';
+    }
+  }
+
+  void dumpPSet(Register Reg) const {
+    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
+    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
+         ++PSetIter) {
+      dbgs() << *PSetIter << ' ';
+    }
+    dbgs() << '\n';
+  }
+
+  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter)
+      Pressure[*PSetIter] += Weight;
+  }
+
+  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter) {
+      auto &P = Pressure[*PSetIter];
+      assert(P >= Weight &&
+             "register pressure must be greater than or equal weight");
+      P -= Weight;
+    }
+  }
+
+  // Return true if Reg is a fixed register, for example, the stack pointer
+  bool isFixedRegister(Register Reg) const {
+    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
+  }
+
+  bool isDefinedInThisLoop(Register Reg) const {
+    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
+  }
+
+  // Search for live-in variables. They are factored into the register pressure
+  // from the beginning. Live-in variables used by every iteration should be
+  // considered as alive throughout the loop. For example, the variable `c` in
+  // following code. \code
+  //   int c = ...;
+  //   for (int i = 0; i < n; i++)
+  //     a[i] += b[i] + c;
+  // \endcode
+  void computeLiveIn() {
+    DenseSet<Register> Used;
+    for (auto &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      for (auto Use : ROMap[&MI].Uses) {
+        auto Reg = Use.RegUnit;
+        // Ignore the variable that appears only on one side of phi instruction
+        // because it's used only at the first iteration.
+        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
+          continue;
+        if (isFixedRegister(Reg))
+          continue;
+        if (isDefinedInThisLoop(Reg))
+          continue;
+        Used.insert(Reg);
+      }
+    }
+
+    for (auto LiveIn : Used)
+      increaseRegisterPressure(InitSetPressure, LiveIn);
+  }
+
+  // Calculate the upper limit of each pressure set
+  void computePressureSetLimit(const RegisterClassInfo &RCI) {
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
+
+    // We assume fixed registers, such as stack pointer, are already in use.
+    // Therefore subtracting the weight of the fixed registers from the limit of
+    // each pressure set in advance.
+    SmallDenseSet<Register, 8> FixedRegs;
+    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
+      for (const MCPhysReg Reg : *TRC)
+        if (isFixedRegister(Reg))
+          FixedRegs.insert(Reg);
+    }
+
+    LLVM_DEBUG({
+      for (auto Reg : FixedRegs) {
+        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
+        const int *Sets = TRI->getRegUnitPressureSets(Reg);
+        for (; *Sets != -1; Sets++) {
+          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+        }
+        dbgs() << "]\n";
+      }
+    });
+
+    for (auto Reg : FixedRegs) {
+      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
+                        << "\n");
+      auto PSetIter = MRI.getPressureSets(Reg);
+      unsigned Weight = PSetIter.getWeight();
+      for (; PSetIter.isValid(); ++PSetIter) {
+        unsigned &Limit = PressureSetLimit[*PSetIter];
+        assert(Limit >= Weight &&
+               "register pressure limit must be greater than or equal weight");
+        Limit -= Weight;
+        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
+                          << " (decreased by " << Weight << ")\n");
+      }
+    }
+  }
+
+  // There are two patterns of last-use.
+  //   - by an instruction of the current iteration
+  //   - by a phi instruction of the next iteration (loop carried value)
+  //
+  // Furthermore, following two groups of instructions are executed
+  // simultaneously
+  //   - next iteration's phi instructions in i-th stage
+  //   - current iteration's instructions in i+1-th stage
+  //
+  // This function calculates the last-use of each register while taking into
+  // account the above two patterns.
+  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
+                                   Instr2StageTy &Stages) const {
+    // We treat virtual registers that are defined and used in this loop.
+    // Following virtual register will be ignored
+    //   - live-in one
+    //   - defined but not used in the loop (potentially live-out)
+    DenseSet<Register> TargetRegs;
+    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
+      if (isDefinedInThisLoop(Reg))
+        TargetRegs.insert(Reg);
+    };
+    for (MachineInstr *MI : OrderedInsts) {
+      if (MI->isPHI()) {
+        Register Reg = getLoopPhiReg(*MI, OrigMBB);
+        UpdateTargetRegs(Reg);
+      } else {
+        for (auto Use : ROMap.find(MI)->getSecond().Uses)
+          UpdateTargetRegs(Use.RegUnit);
+      }
+    }
+
+    const auto InstrScore = [&Stages](MachineInstr *MI) {
+      return Stages[MI] + MI->isPHI();
+    };
+
+    DenseMap<Register, MachineInstr *> LastUseMI;
+    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
+      for (auto Use : ROMap.find(MI)->getSecond().Uses) {
+        auto Reg = Use.RegUnit;
+        if (!TargetRegs.contains(Reg))
+          continue;
+        auto Ite = LastUseMI.find(Reg);
+        if (Ite == LastUseMI.end()) {
+          LastUseMI[Reg] = MI;
+        } else {
+          MachineInstr *Orig = Ite->second;
+          MachineInstr *New = MI;
+          if (InstrScore(Orig) < InstrScore(New))
+            LastUseMI[Reg] = New;
+        }
+      }
+    }
+
+    Instr2LastUsesTy LastUses;
+    for (auto &Entry : LastUseMI)
+      LastUses[Entry.second].insert(Entry.first);
+    return LastUses;
+  }
+
+  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
+  // iterations and check the register pressure at the point where all stages
+  // overlap.
+  //
+  // An example of unrolled loop where #Stage is 4..
+  // Iter   i+0 i+1 i+2 i+3
+  // ------------------------
+  // Stage   0
+  // Stage   1   0
+  // Stage   2   1   0
+  // Stage   3   2   1   0  <- All stages overlap
+  //
+  std::vector<unsigned>
+  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
+                        Instr2StageTy &Stages,
+                        const unsigned StageCount) const {
+    using RegSetTy = SmallDenseSet<Register, 16>;
+
+    // Indexed by #Iter. To treat "local" variables of each stage separately, we
+    // manage the liveness of the registers independently by iterations.
+    SmallVector<RegSetTy> LiveRegSets(StageCount);
+
+    auto CurSetPressure = InitSetPressure;
+    auto MaxSetPressure = InitSetPressure;
+    auto LastUses = std::move(computeLastUses(OrderedInsts, Stages));
+
+    LLVM_DEBUG({
+      dbgs() << "Ordered instructions:\n";
+      for (MachineInstr *MI : OrderedInsts) {
+        dbgs() << "Stage " << Stages[MI] << ": ";
+        MI->dump();
+      }
+    });
+
+    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                   Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      bool Inserted = RegSet.insert(Reg).second;
+      if (!Inserted)
+        return;
+
+      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      increaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                  Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      // live-in register
+      if (!RegSet.contains(Reg))
+        return;
+
+      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      RegSet.erase(Reg);
+      decreaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    for (unsigned I = 0; I < StageCount; I++) {
+      for (MachineInstr *MI : OrderedInsts) {
+        const auto Stage = Stages[MI];
+        if (I < Stage)
+          continue;
+
+        const unsigned Iter = I - Stage;
+
+        for (auto Def : ROMap.find(MI)->getSecond().Defs)
+          InsertReg(LiveRegSets[Iter], Def.RegUnit);
+
+        for (auto LastUse : LastUses[MI]) {
+          if (MI->isPHI()) {
+            if (Iter != 0)
+              EraseReg(LiveRegSets[Iter - 1], LastUse);
+          } else {
+            EraseReg(LiveRegSets[Iter], LastUse);
+          }
+        }
+
+        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+          MaxSetPressure[PSet] =
+              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
+
+        LLVM_DEBUG({
+          dbgs() << "CurSetPressure=";
+          dumpRegisterPressures(CurSetPressure);
+          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
+          MI->dump();
+        });
+      }
+    }
+
+    return MaxSetPressure;
+  }
+
+public:
+  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
+                               const MachineFunction &MF)
+      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
+        TRI(MF.getSubtarget().getRegisterInfo()),
+        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
+        PressureSetLimit(PSetNum, 0) {}
+
+  // Used to calculate register pressure, which is independent of loop
+  // scheduling.
+  void init(const RegisterClassInfo &RCI) {
+    for (MachineInstr &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
+    }
+
+    computeLiveIn();
+    computePressureSetLimit(RCI);
+  }
+
+  // Calculate the maximum register pressures of the loop and check if they
+  // exceed the limit
+  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
+              const unsigned MaxStage) const {
+    assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
+           "the percentage of the margin must be between 0 to 100");
+
+    OrderedInstsTy OrderedInsts;
+    Instr2StageTy Stages;
+    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
+    const auto MaxSetPressure =
+        std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1));
+
+    LLVM_DEBUG({
+      dbgs() << "Dump MaxSetPressure:\n";
+      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
+        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
+      }
+      dbgs() << '\n';
+    });
+
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
+      unsigned Limit = PressureSetLimit[PSet];
+      unsigned Margin = Limit * RegPressureMargin / 100;
+      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
+                        << " Margin=" << Margin << "\n");
+      if (Limit < MaxSetPressure[PSet] + Margin) {
+        LLVM_DEBUG(
+            dbgs()
+            << "Rejected the schedule because of too high register pressure\n");
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
 } // end anonymous namespace
 
 /// Calculate the resource constrained minimum initiation interval for the
@@ -1967,6 +2390,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
   }
 
   bool scheduleFound = false;
+  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
+  if (LimitRegPressure) {
+    HRPDetector =
+        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
+    HRPDetector->init(RegClassInfo);
+  }
   // Keep increasing II until a valid schedule is found.
   for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
     Schedule.reset();
@@ -2044,6 +2473,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     // If a schedule is found, check if it is a valid schedule too.
     if (scheduleFound)
       scheduleFound = Schedule.isValidSchedule(this);
+
+    // If a schedule was found and the option is enabled, check if the schedule
+    // might generate additional register spills/fills.
+    if (scheduleFound && LimitRegPressure)
+      scheduleFound =
+          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
   }
 
   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
@@ -2483,8 +2918,8 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
 /// Order the instructions within a cycle so that the definitions occur
 /// before the uses. Returns true if the instruction is added to the start
 /// of the list, or false if added to the end.
-void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                                 std::deque<SUnit *> &Insts) {
+void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                                 std::deque<SUnit *> &Insts) const {
   MachineInstr *MI = SU->getInstr();
   bool OrderBeforeUse = false;
   bool OrderAfterDef = false;
@@ -2611,7 +3046,8 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
 }
 
 /// Return true if the scheduled Phi has a loop carried operand.
-bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
+bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
+                               MachineInstr &Phi) const {
   if (!Phi.isPHI())
     return false;
   assert(Phi.isPHI() && "Expecting a Phi.");
@@ -2639,8 +3075,9 @@ bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
 ///  (MO)   = v1
 /// If MO appears before Def, then v1 and v3 may get assigned to the same
 /// register.
-bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
-                                       MachineInstr *Def, MachineOperand &MO) {
+bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
+                                       MachineInstr *Def,
+                                       MachineOperand &MO) const {
   if (!MO.isReg())
     return false;
   if (Def->isPHI())
@@ -2895,6 +3332,23 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
   }
 }
 
+std::deque<SUnit *>
+SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
+                                const std::deque<SUnit *> &Instrs) const {
+  std::deque<SUnit *> NewOrderPhi;
+  for (SUnit *SU : Instrs) {
+    if (SU->getInstr()->isPHI())
+      NewOrderPhi.push_back(SU);
+  }
+  std::deque<SUnit *> NewOrderI;
+  for (SUnit *SU : Instrs) {
+    if (!SU->getInstr()->isPHI())
+      orderDependence(SSD, SU, NewOrderI);
+  }
+  llvm::append_range(NewOrderPhi, NewOrderI);
+  return NewOrderPhi;
+}
+
 /// After the schedule has been formed, call this function to combine
 /// the instructions from the different stages/cycles.  That is, this
 /// function creates a schedule that represents a single iteration.
@@ -2924,19 +3378,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
   // generated code.
   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
-    std::deque<SUnit *> newOrderPhi;
-    for (SUnit *SU : cycleInstrs) {
-      if (SU->getInstr()->isPHI())
-        newOrderPhi.push_back(SU);
-    }
-    std::deque<SUnit *> newOrderI;
-    for (SUnit *SU : cycleInstrs) {
-      if (!SU->getInstr()->isPHI())
-        orderDependence(SSD, SU, newOrderI);
-    }
-    // Replace the old order with the new order.
-    cycleInstrs.swap(newOrderPhi);
-    llvm::append_range(cycleInstrs, newOrderI);
+    cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs));
     SSD->fixupRegisterOverlaps(cycleInstrs);
   }
 

diff  --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
new file mode 100644
index 00000000000000..f523b4548eecc9
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -0,0 +1,328 @@
+# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner  -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
+# The specific value of II is not important.
+
+# CHECK: Try to schedule with 21
+# CHECK: 	Can't schedule
+# CHECK: Try to schedule with 22
+# CHECK: 	Can't schedule
+# CHECK: Try to schedule with 23
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 24
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 25
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 26
+# CHECK: Schedule Found? 1 (II=26)
+
+--- |
+  ; ModuleID = 'a.ll'
+  source_filename = "a.c"
+  target datalayout = "e-m:e-Fn32-i64:64-n32:64"
+  target triple = "ppc64le"
+  
+  ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable
+  define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 {
+  entry:
+    %0 = load double, ptr %a, align 8, !tbaa !3
+    %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1
+    %1 = load double, ptr %arrayidx1, align 8, !tbaa !3
+    %cmp163 = icmp sgt i32 %n, 0
+    br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:                               ; preds = %entry
+    %wide.trip.count = zext i32 %n to i64
+    %scevgep1 = getelementptr i8, ptr %b, i64 -8
+    call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count)
+    br label %for.body
+  
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ]
+    ret double %res.0.lcssa
+  
+  for.body:                                         ; preds = %for.body, %for.body.preheader
+    %res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ]
+    %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ]
+    %3 = getelementptr i8, ptr %2, i64 8
+    %4 = load double, ptr %3, align 8, !tbaa !3
+    %5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0)
+    %6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5)
+    %7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6)
+    %8 = tail call double @llvm.fmuladd.f64(double %7, double %4, double %7)
+    %9 = tail call double @llvm.fmuladd.f64(double %8, double %4, double %8)
+    %10 = tail call double @llvm.fmuladd.f64(double %9, double %4, double %9)
+    %11 = tail call double @llvm.fmuladd.f64(double %10, double %4, double %10)
+    %12 = tail call double @llvm.fmuladd.f64(double %11, double %4, double %11)
+    %13 = tail call double @llvm.fmuladd.f64(double %12, double %4, double %12)
+    %14 = tail call double @llvm.fmuladd.f64(double %13, double %4, double %13)
+    %15 = tail call double @llvm.fmuladd.f64(double %14, double %4, double %14)
+    %16 = tail call double @llvm.fmuladd.f64(double %15, double %4, double %15)
+    %17 = tail call double @llvm.fmuladd.f64(double %16, double %4, double %16)
+    %18 = tail call double @llvm.fmuladd.f64(double %17, double %4, double %17)
+    %19 = tail call double @llvm.fmuladd.f64(double %18, double %4, double %18)
+    %20 = tail call double @llvm.fmuladd.f64(double %19, double %4, double %19)
+    %add = fadd double %19, %20
+    %21 = tail call double @llvm.fmuladd.f64(double %20, double %4, double %add)
+    %add35 = fadd double %12, %21
+    %22 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %add35)
+    %add38 = fadd double %13, %22
+    %23 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %add38)
+    %mul = fmul double %4, %7
+    %mul46 = fmul double %mul, %14
+    %24 = tail call double @llvm.fmuladd.f64(double %mul46, double %13, double %16)
+    %mul50 = fmul double %4, %9
+    %mul51 = fmul double %1, %mul50
+    %25 = tail call double @llvm.fmuladd.f64(double %mul51, double %11, double %24)
+    %add53 = fadd double %5, %25
+    %add54 = fadd double %6, %add53
+    %mul55 = fmul double %14, %16
+    %mul56 = fmul double %mul55, %17
+    %mul57 = fmul double %mul56, %18
+    %26 = tail call double @llvm.fmuladd.f64(double %mul57, double %19, double %add54)
+    %27 = tail call double @llvm.fmuladd.f64(double %10, double %1, double %26)
+    %28 = tail call double @llvm.fmuladd.f64(double %8, double %6, double %27)
+    %mul61 = fmul double %20, %21
+    %mul62 = fmul double %mul61, %22
+    %29 = tail call double @llvm.fmuladd.f64(double %mul62, double %23, double %28)
+    %mul64 = fmul double %26, %29
+    %mul65 = fmul double %24, %mul64
+    %mul66 = fmul double %12, %mul65
+    %30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165)
+    %31 = call i1 @llvm.loop.decrement.i64(i64 1)
+    br i1 %31, label %for.body, label %for.cond.cleanup, !llvm.loop !7
+  }
+  
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare double @llvm.fmuladd.f64(double, double, double) #1
+  
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare void @llvm.set.loop.iterations.i64(i64) #2
+  
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i1 @llvm.loop.decrement.i64(i64) #2
+  
+  attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn }
+  
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+  
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 7, !"uwtable", i32 2}
+  !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"double", !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+  !7 = distinct !{!7, !8, !9}
+  !8 = !{!"llvm.loop.mustprogress"}
+  !9 = !{!"llvm.loop.unroll.disable"}
+
+...
+---
+name:            kernel
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vsfrc, preferred-register: '' }
+  - { id: 1, class: vsfrc, preferred-register: '' }
+  - { id: 2, class: g8rc, preferred-register: '' }
+  - { id: 3, class: vsfrc, preferred-register: '' }
+  - { id: 4, class: vsfrc, preferred-register: '' }
+  - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 6, class: g8rc, preferred-register: '' }
+  - { id: 7, class: vsfrc, preferred-register: '' }
+  - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 10, class: g8rc, preferred-register: '' }
+  - { id: 11, class: gprc, preferred-register: '' }
+  - { id: 12, class: vsfrc, preferred-register: '' }
+  - { id: 13, class: crrc, preferred-register: '' }
+  - { id: 14, class: vsfrc, preferred-register: '' }
+  - { id: 15, class: g8rc, preferred-register: '' }
+  - { id: 16, class: g8rc, preferred-register: '' }
+  - { id: 17, class: g8rc, preferred-register: '' }
+  - { id: 18, class: f8rc, preferred-register: '' }
+  - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 20, class: vsfrc, preferred-register: '' }
+  - { id: 21, class: vsfrc, preferred-register: '' }
+  - { id: 22, class: vsfrc, preferred-register: '' }
+  - { id: 23, class: vsfrc, preferred-register: '' }
+  - { id: 24, class: vsfrc, preferred-register: '' }
+  - { id: 25, class: vsfrc, preferred-register: '' }
+  - { id: 26, class: vsfrc, preferred-register: '' }
+  - { id: 27, class: vsfrc, preferred-register: '' }
+  - { id: 28, class: vsfrc, preferred-register: '' }
+  - { id: 29, class: vsfrc, preferred-register: '' }
+  - { id: 30, class: vsfrc, preferred-register: '' }
+  - { id: 31, class: vsfrc, preferred-register: '' }
+  - { id: 32, class: vsfrc, preferred-register: '' }
+  - { id: 33, class: vsfrc, preferred-register: '' }
+  - { id: 34, class: vsfrc, preferred-register: '' }
+  - { id: 35, class: vsfrc, preferred-register: '' }
+  - { id: 36, class: vsfrc, preferred-register: '' }
+  - { id: 37, class: vsfrc, preferred-register: '' }
+  - { id: 38, class: vsfrc, preferred-register: '' }
+  - { id: 39, class: vsfrc, preferred-register: '' }
+  - { id: 40, class: vsfrc, preferred-register: '' }
+  - { id: 41, class: vsfrc, preferred-register: '' }
+  - { id: 42, class: vsfrc, preferred-register: '' }
+  - { id: 43, class: vsfrc, preferred-register: '' }
+  - { id: 44, class: vsfrc, preferred-register: '' }
+  - { id: 45, class: vsfrc, preferred-register: '' }
+  - { id: 46, class: vsfrc, preferred-register: '' }
+  - { id: 47, class: vsfrc, preferred-register: '' }
+  - { id: 48, class: vsfrc, preferred-register: '' }
+  - { id: 49, class: vsfrc, preferred-register: '' }
+  - { id: 50, class: vsfrc, preferred-register: '' }
+  - { id: 51, class: vsfrc, preferred-register: '' }
+  - { id: 52, class: vsfrc, preferred-register: '' }
+  - { id: 53, class: vsfrc, preferred-register: '' }
+  - { id: 54, class: vsfrc, preferred-register: '' }
+  - { id: 55, class: vsfrc, preferred-register: '' }
+  - { id: 56, class: vsfrc, preferred-register: '' }
+  - { id: 57, class: vsfrc, preferred-register: '' }
+  - { id: 58, class: vsfrc, preferred-register: '' }
+  - { id: 59, class: vsfrc, preferred-register: '' }
+  - { id: 60, class: vsfrc, preferred-register: '' }
+  - { id: 61, class: vsfrc, preferred-register: '' }
+  - { id: 62, class: crbitrc, preferred-register: '' }
+liveins:
+  - { reg: '$x3', virtual-reg: '%8' }
+  - { reg: '$x4', virtual-reg: '%9' }
+  - { reg: '$x5', virtual-reg: '%10' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x50000000), %bb.1(0x30000000)
+    liveins: $x3, $x4, $x5
+  
+    %10:g8rc = COPY killed $x5
+    %9:g8rc_and_g8rc_nox0 = COPY killed $x4
+    %8:g8rc_and_g8rc_nox0 = COPY killed $x3
+    %11:gprc = COPY killed %10.sub_32
+    %13:crrc = CMPWI %11, 0
+    BCC 44, killed %13, %bb.2
+  
+  bb.1:
+    successors: %bb.3(0x80000000)
+  
+    %12:vsfrc = XXLXORdpz
+    B %bb.3
+  
+  bb.2.for.body.preheader:
+    successors: %bb.4(0x80000000)
+  
+    %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3)
+    %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3)
+    %16:g8rc = IMPLICIT_DEF
+    %15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32
+    %17:g8rc = RLDICL killed %15, 0, 32
+    %2:g8rc = ADDI8 killed %9, -8
+    MTCTR8loop killed %17, implicit-def dead $ctr8
+    %14:vsfrc = XXLXORdpz
+    B %bb.4
+  
+  bb.3.for.cond.cleanup:
+    %3:vsfrc = PHI %12, %bb.1, %7, %bb.4
+    $f1 = COPY killed %3
+    BLR8 implicit $lr8, implicit $rm, implicit killed $f1
+  
+  bb.4.for.body:
+    successors: %bb.4(0x7c000000), %bb.3(0x04000000)
+  
+    %4:vsfrc = PHI %14, %bb.2, %7, %bb.4
+    %5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4
+    %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3)
+    %6:g8rc = COPY killed %19
+    %20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm
+    %21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm
+    %22:vsfrc = nofpexcept XSMADDADP %21, %21, %18, implicit $rm
+    %23:vsfrc = nofpexcept XSMADDADP %22, %22, %18, implicit $rm
+    %24:vsfrc = nofpexcept XSMADDADP %23, %23, %18, implicit $rm
+    %25:vsfrc = nofpexcept XSMADDADP %24, %24, %18, implicit $rm
+    %26:vsfrc = nofpexcept XSMADDADP %25, %25, %18, implicit $rm
+    %27:vsfrc = nofpexcept XSMADDADP %26, %26, %18, implicit $rm
+    %28:vsfrc = nofpexcept XSMADDADP %27, %27, %18, implicit $rm
+    %29:vsfrc = nofpexcept XSMADDADP %28, %28, %18, implicit $rm
+    %30:vsfrc = nofpexcept XSMADDADP %29, %29, %18, implicit $rm
+    %31:vsfrc = nofpexcept XSMADDADP killed %30, %30, %18, implicit $rm
+    %32:vsfrc = nofpexcept XSMADDADP %31, %31, %18, implicit $rm
+    %33:vsfrc = nofpexcept XSMADDADP %32, %32, %18, implicit $rm
+    %34:vsfrc = nofpexcept XSMADDADP %33, %33, %18, implicit $rm
+    %35:vsfrc = nofpexcept XSMADDADP %34, %34, %18, implicit $rm
+    %36:vsfrc = nofpexcept XSADDDP %34, %35, implicit $rm
+    %37:vsfrc = nofpexcept XSMADDADP killed %36, %35, %18, implicit $rm
+    %38:vsfrc = nofpexcept XSADDDP %27, %37, implicit $rm
+    %39:vsfrc = nofpexcept XSMADDADP killed %38, %20, %18, implicit $rm
+    %40:vsfrc = nofpexcept XSADDDP %28, %39, implicit $rm
+    %41:vsfrc = nofpexcept XSMADDADP killed %40, %21, %18, implicit $rm
+    %42:vsfrc = nofpexcept XSMULDP %18, killed %22, implicit $rm
+    %43:vsfrc = nofpexcept XSMULDP killed %42, %29, implicit $rm
+    %44:vsfrc = nofpexcept XSMADDADP %31, killed %43, killed %28, implicit $rm
+    %45:vsfrc = nofpexcept XSMULDP killed %18, killed %24, implicit $rm
+    %46:vsfrc = nofpexcept XSMULDP %1, killed %45, implicit $rm
+    %47:vsfrc = nofpexcept XSMADDADP %44, killed %46, killed %26, implicit $rm
+    %48:vsfrc = nofpexcept XSADDDP killed %20, killed %47, implicit $rm
+    %49:vsfrc = nofpexcept XSADDDP %21, killed %48, implicit $rm
+    %50:vsfrc = nofpexcept XSMULDP killed %29, killed %31, implicit $rm
+    %51:vsfrc = nofpexcept XSMULDP killed %50, killed %32, implicit $rm
+    %52:vsfrc = nofpexcept XSMULDP killed %51, killed %33, implicit $rm
+    %53:vsfrc = nofpexcept XSMADDADP killed %49, killed %52, killed %34, implicit $rm
+    %54:vsfrc = nofpexcept XSMADDADP %53, %25, %1, implicit $rm
+    %55:vsfrc = nofpexcept XSMADDADP killed %54, killed %23, killed %21, implicit $rm
+    %56:vsfrc = nofpexcept XSMULDP killed %35, killed %37, implicit $rm
+    %57:vsfrc = nofpexcept XSMULDP killed %56, killed %39, implicit $rm
+    %58:vsfrc = nofpexcept XSMADDADP killed %55, killed %57, killed %41, implicit $rm
+    %59:vsfrc = nofpexcept XSMULDP killed %53, killed %58, implicit $rm
+    %60:vsfrc = nofpexcept XSMULDP killed %44, killed %59, implicit $rm
+    %61:vsfrc = nofpexcept XSMULDP killed %27, killed %60, implicit $rm
+    %7:vsfrc = nofpexcept XSMADDADP killed %4, killed %61, killed %25, implicit $rm
+    BDNZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
+    B %bb.3
+
+...