[llvm] 28d09bb - [CodeGen][ARM] Enable Swing Modulo Scheduling for ARM

David Penry via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 28 13:01:45 PDT 2022


Author: David Penry
Date: 2022-04-28T13:01:18-07:00
New Revision: 28d09bbbc3d09c912b54a4d5edb32cab7de32a6f

URL: https://github.com/llvm/llvm-project/commit/28d09bbbc3d09c912b54a4d5edb32cab7de32a6f
DIFF: https://github.com/llvm/llvm-project/commit/28d09bbbc3d09c912b54a4d5edb32cab7de32a6f.diff

LOG: [CodeGen][ARM] Enable Swing Modulo Scheduling for ARM

This patch permits Swing Modulo Scheduling for ARM targets and
turns it on by default for the Cortex-M7.  The t2Bcc
instruction is recognized as a loop-ending branch.
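
For reference, the new MIR tests below exercise the pass directly
with a command of the form (input.mir stands in for the test file):

    llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - input.mir

In a full -O3 compilation the pass runs before register allocation,
gated on the new use-mipipeliner subtarget feature.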

MachinePipeliner is extended by adding support for
"unpipelineable" instructions.  These instructions are
those which contribute to the loop exit test; in the SMS
papers they are removed before creating the dependence graph
and then inserted into the final schedule of the kernel and
prologues. Support for these instructions was not previously
necessary because current targets supporting SMS have only
supported it for hardware loop branches, which have no
loop-exit-contributing instructions in the loop body.
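
Concretely, for the Thumb-2 loops handled here the exit test is a
CPSR-setting subtract feeding the conditional branch, both inside
the loop body (simplified from the swp-fixedii.mir test below;
%bb.exit and the register numbers are placeholders):

    %7:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr
    t2Bcc %bb.exit, 0 /* CC::eq */, $cpsr

shouldIgnoreForPipelining() reports both instructions, and
computeUnpipelineableNodes() then marks their transitive
predecessors as unpipelineable as well.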

The current structure of the MachinePipeliner makes it difficult
to remove/exclude these instructions from the dependence graph.
Therefore, this patch leaves them in the graph, but adds a
"normalization" method which moves them in the schedule to
stage 0, which causes them to appear properly in kernel and
prologues.
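
A schematic example with hypothetical cycles and II = 3: if the CC
setter were initially placed in cycle 4 (stage 1), normalization
moves it as early as its scheduled predecessors allow,

    SU(subs):  cycle 4, stage 1  -->  cycle 2, stage 0

so the expander emits it once per iteration in each prologue and in
the kernel instead of rotating it across stages.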

It was also necessary to be more careful about boundary nodes
when iterating across successors in the dependence graph because
the loop exit branch is now a non-artificial successor to
instructions in the graph. In addition, schedules with physical
use/def pairs in the same cycle are treated as invalid, because
the scheduling logic does not respect physical register
dependences once both instructions land in the same cycle.
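
For example (hypothetical cycle assignment), a schedule that places
a CPSR def and its use in the same cycle,

    cycle 7:  %n:rgpr = t2SUBri ..., def $cpsr
    cycle 7:  t2Bcc ..., $cpsr

is rejected by isValidSchedule(), because orderDependence() does not
guarantee that the def precedes the use within a single cycle.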

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D122672

Added: 
    llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
    llvm/test/CodeGen/Thumb2/swp-fixedii.mir

Modified: 
    llvm/include/llvm/CodeGen/MachinePipeliner.h
    llvm/include/llvm/CodeGen/ModuloSchedule.h
    llvm/lib/CodeGen/MachinePipeliner.cpp
    llvm/lib/CodeGen/ModuloSchedule.cpp
    llvm/lib/Target/ARM/ARM.td
    llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/lib/Target/ARM/ARMBaseInstrInfo.h
    llvm/lib/Target/ARM/ARMSubtarget.cpp
    llvm/lib/Target/ARM/ARMSubtarget.h
    llvm/lib/Target/ARM/ARMTargetMachine.cpp
    llvm/test/CodeGen/ARM/O3-pipeline.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 170f20182001f..4559f7a9bde78 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -84,6 +84,8 @@ class MachinePipeliner : public MachineFunctionPass {
     SmallVector<MachineOperand, 4> BrCond;
     MachineInstr *LoopInductionVar = nullptr;
     MachineInstr *LoopCompare = nullptr;
+    std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo =
+        nullptr;
   };
   LoopInfo LI;
 
@@ -119,6 +121,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   LiveIntervals &LIS;
   const RegisterClassInfo &RegClassInfo;
   unsigned II_setByPragma = 0;
+  TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr;
 
   /// A toplogical ordering of the SUnits, which is needed for changing
   /// dependences and iterating over the SUnits.
@@ -196,9 +199,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci, unsigned II)
+                    const RegisterClassInfo &rci, unsigned II,
+                    TargetInstrInfo::PipelinerLoopInfo *PLI)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) {
+        RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
+        Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
@@ -589,6 +594,13 @@ class SMSchedule {
     return ScheduledInstrs[cycle];
   }
 
+  SmallSet<SUnit *, 8>
+  computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
+                             TargetInstrInfo::PipelinerLoopInfo *PLI);
+
+  bool
+  normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
+                                    TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,

diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 4045df807cdb3..c515101e80fdf 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -191,8 +191,8 @@ class ModuloScheduleExpander {
   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
-                      ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
-                      MBBVectorTy &PrologBBs);
+                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
                             ValueMapTy *VRMap, InstrMapTy &InstrMap,

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 0bffa9154fc4b..9ea6e9b981721 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -255,6 +255,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
              << "Failed to pipeline loop";
     });
 
+    LI.LoopPipelinerInfo.reset();
     return Changed;
   }
 
@@ -262,6 +263,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
 
   Changed = swingModuloScheduler(L);
 
+  LI.LoopPipelinerInfo.reset();
   return Changed;
 }
 
@@ -354,7 +356,8 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
 
   LI.LoopInductionVar = nullptr;
   LI.LoopCompare = nullptr;
-  if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
+  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!LI.LoopPipelinerInfo) {
     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
     NumFailLoop++;
     ORE->emit([&]() {
@@ -419,7 +422,7 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 
   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
-                        II_setByPragma);
+                        II_setByPragma, LI.LoopPipelinerInfo.get());
 
   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -1422,7 +1425,7 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
 static bool ignoreDependence(const SDep &D, bool isPred) {
-  if (D.isArtificial())
+  if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
     return true;
   return D.getKind() == SDep::Anti && isPred;
 }
@@ -1471,6 +1474,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
     SUnit *SU = &SUnits[I];
     for (const SDep &S : SU->Succs) {
       SUnit *succ = S.getSUnit();
+      if (succ->isBoundaryNode())
+        continue;
       if (S.getLatency() == 0)
         zeroLatencyHeight =
             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
@@ -1788,7 +1793,8 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
   NodesAdded.insert(SU);
   for (auto &SI : SU->Succs) {
     SUnit *Successor = SI.getSUnit();
-    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
+    if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
+        NodesAdded.count(Successor) == 0)
       addConnectedNodes(Successor, NewSet, NodesAdded);
   }
   for (auto &PI : SU->Preds) {
@@ -2080,6 +2086,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
       });
     } while (++NI != NE && scheduleFound);
 
+    // If a schedule is found, ensure non-pipelined instructions are in stage 0
+    if (scheduleFound)
+      scheduleFound =
+          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
+
     // If a schedule is found, check if it is a valid schedule too.
     if (scheduleFound)
       scheduleFound = Schedule.isValidSchedule(this);
@@ -2263,7 +2274,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
                                          bool isSucc) {
   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
-      Dep.isArtificial())
+      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
     return false;
 
   if (!SwpPruneLoopCarried)
@@ -2430,7 +2441,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
   while (!Worklist.empty()) {
     const SDep &Cur = Worklist.pop_back_val();
     SUnit *SuccSU = Cur.getSUnit();
-    if (Visited.count(SuccSU))
+    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
       continue;
     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
     if (it == InstrToCycle.end())
@@ -2697,21 +2708,91 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
   return false;
 }
 
+/// Determine transitive dependences of unpipelineable instructions
+SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DoNotPipeline;
+  SmallVector<SUnit *, 8> Worklist;
+
+  for (auto &SU : SSD->SUnits)
+    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
+      Worklist.push_back(&SU);
+
+  while (!Worklist.empty()) {
+    auto SU = Worklist.pop_back_val();
+    if (DoNotPipeline.count(SU))
+      continue;
+    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
+    DoNotPipeline.insert(SU);
+    for (auto &Dep : SU->Preds)
+      Worklist.push_back(Dep.getSUnit());
+    if (SU->getInstr()->isPHI())
+      for (auto &Dep : SU->Succs)
+        if (Dep.getKind() == SDep::Anti)
+          Worklist.push_back(Dep.getSUnit());
+  }
+  return DoNotPipeline;
+}
+
+// Determine all instructions upon which any unpipelineable instruction depends
+// and ensure that they are in stage 0.  If unable to do so, return false.
+bool SMSchedule::normalizeNonPipelinedInstructions(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+
+  int NewLastCycle = INT_MIN;
+  for (SUnit &SU : SSD->SUnits) {
+    if (!SU.isInstr())
+      continue;
+    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
+      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
+      continue;
+    }
+
+    // Put the non-pipelined instruction as early as possible in the schedule
+    int NewCycle = getFirstCycle();
+    for (auto &Dep : SU.Preds)
+      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
+
+    int OldCycle = InstrToCycle[&SU];
+    if (OldCycle != NewCycle) {
+      InstrToCycle[&SU] = NewCycle;
+      auto &OldS = getInstructions(OldCycle);
+      OldS.erase(std::remove(OldS.begin(), OldS.end(), &SU), OldS.end());
+      getInstructions(NewCycle).emplace_back(&SU);
+      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
+                        << ") is not pipelined; moving from cycle " << OldCycle
+                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
+    }
+    NewLastCycle = std::max(NewLastCycle, NewCycle);
+  }
+  LastCycle = NewLastCycle;
+  return true;
+}
+
 // Check if the generated schedule is valid. This function checks if
 // an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
 // physical register values that may cross a basic block boundary.
+// Furthermore, if a physical def/use pair is assigned to the same
+// cycle, orderDependence does not guarantee def/use ordering, so that
+// case should be considered invalid.  (The test checks for both
+// earlier and same-cycle use to be more robust.)
 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   for (SUnit &SU : SSD->SUnits) {
     if (!SU.hasPhysRegDefs)
       continue;
     int StageDef = stageScheduled(&SU);
+    int CycleDef = InstrToCycle[&SU];
     assert(StageDef != -1 && "Instruction should have been scheduled.");
     for (auto &SI : SU.Succs)
-      if (SI.isAssignedRegDep())
-        if (Register::isPhysicalRegister(SI.getReg()))
+      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
+        if (Register::isPhysicalRegister(SI.getReg())) {
           if (stageScheduled(SI.getSUnit()) != StageDef)
             return false;
+          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
+            return false;
+        }
   }
   return true;
 }

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b974fa9846f25..20aecdf222e26 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -158,7 +158,7 @@ void ModuloScheduleExpander::generatePipelinedLoop() {
 
   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
   // Generate the epilog instructions to complete the pipeline.
-  generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
+  generateEpilog(MaxStageCount, KernelBB, BB, VRMap, EpilogBBs, PrologBBs);
 
   // We need this step because the register allocation doesn't handle some
   // situations well, so we insert copies to help out.
@@ -240,11 +240,9 @@ void ModuloScheduleExpander::generateProlog(unsigned LastStage,
 /// Generate the pipeline epilog code. The epilog code finishes the iterations
 /// that were started in either the prolog or the kernel.  We create a basic
 /// block for each stage that needs to complete.
-void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
-                                            MachineBasicBlock *KernelBB,
-                                            ValueMapTy *VRMap,
-                                            MBBVectorTy &EpilogBBs,
-                                            MBBVectorTy &PrologBBs) {
+void ModuloScheduleExpander::generateEpilog(
+    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
+    ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs) {
   // We need to change the branch from the kernel to the first epilog block, so
   // this call to analyze branch uses the kernel rather than the original BB.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -314,7 +312,12 @@ void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
   // Create a branch to the new epilog from the kernel.
   // Remove the original branch and add a new branch to the epilog.
   TII->removeBranch(*KernelBB);
-  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+  assert((OrigBB == TBB || OrigBB == FBB) &&
+         "Unable to determine looping branch direction");
+  if (OrigBB != TBB)
+    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
+  else
+    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
   // Add a branch to the loop exit.
   if (EpilogBBs.size() > 0) {
     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();

diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index d9bc2827f7d29..4bcba38efe02d 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -494,6 +494,10 @@ def FeatureNoNegativeImmediates
 def FeatureUseMISched: SubtargetFeature<"use-misched", "UseMISched", "true",
                                         "Use the MachineScheduler">;
 
+// Use the MachinePipeliner for instruction scheduling for the subtarget.
+def FeatureUseMIPipeliner: SubtargetFeature<"use-mipipeliner", "UseMIPipeliner", "true",
+                                            "Use the MachinePipeliner">;
+
 // False if scheduling should happen again after register allocation.
 def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
     "DisablePostRAScheduler", "true",
@@ -1395,6 +1399,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model,        [ARMv7em,
 def : ProcessorModel<"cortex-m7", CortexM7Model,        [ARMv7em,
                                                          ProcM7,
                                                          FeatureFPARMv8_D16,
+                                                         FeatureUseMIPipeliner,
                                                          FeatureUseMISched]>;
 
 def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9bb9df536b2ee..28dd630ed23ba 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6721,3 +6721,78 @@ unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) {
   return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip
                                                           : ARM::BLX_pred;
 }
+
+namespace {
+class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *Loop, *EndLoop, *LoopCount;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+
+  // Meanings of the various stuff with loop types:
+  // t2Bcc:
+  //   Loop = null -- there is no setup.
+  //   EndLoop = branch at end of original BB that will become a kernel
+  //   LoopCount = CC setter live into branch
+public:
+  ARMPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop,
+                       MachineInstr *LoopCount)
+      : Loop(Loop), EndLoop(EndLoop), LoopCount(LoopCount),
+        MF(EndLoop->getParent()->getParent()),
+        TII(MF->getSubtarget().getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    // Only ignore the terminator.
+    return MI == EndLoop || MI == LoopCount;
+  }
+
+  Optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+
+    if (isCondBranchOpcode(EndLoop->getOpcode())) {
+      Cond.push_back(EndLoop->getOperand(1));
+      Cond.push_back(EndLoop->getOperand(2));
+      if (EndLoop->getOperand(0).getMBB() == EndLoop->getParent()) {
+        TII->reverseBranchCondition(Cond);
+      }
+      return {};
+    } else
+      llvm_unreachable("Unknown EndLoop");
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ARMBaseInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
+  MachineBasicBlock *Preheader = *LoopBB->pred_begin();
+  if (Preheader == LoopBB)
+    Preheader = *std::next(LoopBB->pred_begin());
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2Bcc) {
+    // If the branch is a Bcc, then the CPSR should be set somewhere within the
+    // block.  We need to determine the reaching definition of CPSR so that
+    // it can be marked as non-pipelineable, allowing the pipeliner to force
+    // it into stage 0 or give up if it cannot or will not do so.
+    MachineInstr *CCSetter = nullptr;
+    for (auto &L : LoopBB->instrs()) {
+      if (L.isCall())
+        return nullptr;
+      if (isCPSRDefined(L))
+        CCSetter = &L;
+    }
+    if (CCSetter)
+      return std::make_unique<ARMPipelinerLoopInfo>(nullptr, &*I, CCSetter);
+    else
+      return nullptr; // Unable to find the CC setter, so unable to guarantee
+                      // that pipeline will work
+  }
+
+  return nullptr;
+}

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index ab9643592724d..40acb27d1eb1f 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -372,6 +372,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
            MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }
 
+  /// Analyze loop L, which must be a single-basic-block loop, and if the
+  /// conditions can be understood enough produce a PipelinerLoopInfo object.
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
 private:
   /// Returns an unused general-purpose register which can be used for
   /// constructing an outlined call if one exists. Returns 0 otherwise.

diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index b62f447e8d581..89e5b8762d80f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -393,6 +393,14 @@ bool ARMSubtarget::enableSubRegLiveness() const {
   return hasMVEIntegerOps();
 }
 
+bool ARMSubtarget::enableMachinePipeliner() const {
+  // Enable the MachinePipeliner before register allocation for subtargets
+  // with the use-mipipeliner feature.
+  return getSchedModel().hasInstrSchedModel() && useMachinePipeliner();
+}
+
+bool ARMSubtarget::useDFAforSMS() const { return false; }
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
   if (enableMachineScheduler())

diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index d426157c54536..8d56c70e80949 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -417,6 +417,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool isRWPI() const;
 
   bool useMachineScheduler() const { return UseMISched; }
+  bool useMachinePipeliner() const { return UseMIPipeliner; }
   bool hasMinSize() const { return OptMinSize; }
   bool isThumb1Only() const { return isThumb() && !hasThumb2(); }
   bool isThumb2() const { return isThumb() && hasThumb2(); }
@@ -465,6 +466,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// Returns true if machine scheduler should be enabled.
   bool enableMachineScheduler() const override;
 
+  /// Returns true if machine pipeliner should be enabled.
+  bool enableMachinePipeliner() const override;
+  bool useDFAforSMS() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
 

diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 401a00841747d..dadf7a557238c 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -509,6 +509,9 @@ bool ARMPassConfig::addGlobalInstructionSelect() {
 
 void ARMPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
+    if (getOptLevel() == CodeGenOpt::Aggressive)
+      addPass(&MachinePipelinerID);
+
     addPass(createMVETPAndVPTOptimisationsPass());
 
     addPass(createMLxExpansionPass());

diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 10c56a3c495b5..6e55134581113 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -95,6 +95,13 @@
 ; CHECK-NEXT:      Peephole Optimizations
 ; CHECK-NEXT:      Remove dead machine instructions
 ; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Slot index numbering
+; CHECK-NEXT:      Live Interval Analysis
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
+; CHECK-NEXT:      Machine Optimization Remark Emitter
+; CHECK-NEXT:      Modulo Software Pipelining
+; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Machine Natural Loop Construction
 ; CHECK-NEXT:      MVE TailPred and VPT Optimisation Pass
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
 ; CHECK-NEXT:      MachineDominator Tree Construction

diff --git a/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
new file mode 100644
index 0000000000000..3f3ff5e4bd290
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 {
+  entry:
+    %cmp8 = icmp sgt i32 %sz, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %scevgep = getelementptr float, float* %b, i32 -1
+    %scevgep4 = getelementptr float, float* %a, i32 -1
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+    %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+    %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+    %0 = load float, float* %scevgep7, align 4
+    %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1
+    %1 = load float, float* %scevgep3, align 4
+    %mul = fmul fast float %1, %0
+    %add = fadd fast float %mul, %sum.010
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1
+    %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+    %exitcond.not = icmp ne i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.body, label %for.end, !llvm.loop !0
+
+  for.end:                                          ; preds = %for.body, %entry
+    %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+    ret float %sum.0.lcssa
+  }
+
+  !0 = distinct !{!0, !1, !2, !3}
+  !1 = !{!"llvm.loop.mustprogress"}
+  !2 = !{!"llvm.loop.unroll.disable"}
+  !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3}
+
+...
+---
+name:            dot
+alignment:       2
+tracksRegLiveness: true
+constants:
+  - id:              0
+    value:           'float 0.000000e+00'
+    alignment:       4
+    isTargetSpecific: false
+body:             |
+  ; CHECK-LABEL: name: dot
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x50000000), %bb.1(0x30000000)
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnopc = COPY $r2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gprnopc = COPY $r1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gprnopc = COPY $r0
+  ; CHECK-NEXT:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.2, 10 /* CC::ge */, $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.for.body.preheader:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]]
+  ; CHECK-NEXT:   [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.for.body:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+  ; CHECK-NEXT:   [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+  ; CHECK-NEXT:   [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]]
+  ; CHECK-NEXT:   t2Bcc %bb.9, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.6, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.for.body:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000), %bb.8(0x00000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4)
+  ; CHECK-NEXT:   [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4)
+  ; CHECK-NEXT:   [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]]
+  ; CHECK-NEXT:   t2Bcc %bb.8, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.for.body:
+  ; CHECK-NEXT:   successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7
+  ; CHECK-NEXT:   [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4)
+  ; CHECK-NEXT:   [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4)
+  ; CHECK-NEXT:   [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]]
+  ; CHECK-NEXT:   [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2Bcc %bb.8, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7
+  ; CHECK-NEXT:   [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8
+  ; CHECK-NEXT:   [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.for.end:
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9
+  ; CHECK-NEXT:   [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r0 = COPY [[VMOVRS]]
+  ; CHECK-NEXT:   tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.4(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %13:gprnopc = COPY $r2
+    %12:gprnopc = COPY $r1
+    %11:gprnopc = COPY $r0
+    t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.1, 10 /* CC::ge */, $cpsr
+
+  bb.4:
+    successors: %bb.3(0x80000000)
+
+    %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = COPY %16
+    %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg
+    %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    %1:gpr = COPY %17
+
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    %2:gprnopc = PHI %1, %bb.1, %9, %bb.2
+    %3:gprnopc = PHI %0, %bb.1, %8, %bb.2
+    %4:gprnopc = PHI %13, %bb.1, %7, %bb.2
+    %5:spr = PHI %15, %bb.1, %6, %bb.2
+    %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg
+    %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+    %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg
+    %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+    %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg
+    %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg
+    %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr
+    %7:gpr = COPY %23
+    %8:gpr = COPY %20
+    %9:gpr = COPY %18
+    t2Bcc %bb.3, 0 /* CC::eq */, $cpsr
+    t2B %bb.2, 14 /* CC::al */, $noreg
+
+  bb.3.for.end:
+    %10:spr = PHI %14, %bb.4, %6, %bb.2
+    %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg
+    $r0 = COPY %24
+    tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+
+...

diff --git a/llvm/test/CodeGen/Thumb2/swp-fixedii.mir b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir
new file mode 100644
index 0000000000000..579123c48a1f6
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 {
+  entry:
+    %cmp8 = icmp sgt i32 %sz, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %scevgep = getelementptr float, float* %b, i32 -1
+    %scevgep4 = getelementptr float, float* %a, i32 -1
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+    %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+    %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+    %0 = load float, float* %scevgep7, align 4
+    %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1
+    %1 = load float, float* %scevgep3, align 4
+    %mul = fmul fast float %1, %0
+    %add = fadd fast float %mul, %sum.010
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1
+    %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+  for.end:                                          ; preds = %for.body, %entry
+    %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+    ret float %sum.0.lcssa
+  }
+
+  !0 = distinct !{!0, !1, !2, !3}
+  !1 = !{!"llvm.loop.mustprogress"}
+  !2 = !{!"llvm.loop.unroll.disable"}
+  !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3}
+
+...
+---
+name:            dot
+alignment:       2
+tracksRegLiveness: true
+constants:
+  - id:              0
+    value:           'float 0.000000e+00'
+    alignment:       4
+    isTargetSpecific: false
+body:             |
+  ; CHECK-LABEL: name: dot
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x50000000), %bb.1(0x30000000)
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnopc = COPY $r2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gprnopc = COPY $r1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gprnopc = COPY $r0
+  ; CHECK-NEXT:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.2, 10 /* CC::ge */, $cpsr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.for.body.preheader:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]]
+  ; CHECK-NEXT:   [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.for.body:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+  ; CHECK-NEXT:   [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+  ; CHECK-NEXT:   [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]]
+  ; CHECK-NEXT:   t2Bcc %bb.9, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.6, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.for.body:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000), %bb.8(0x00000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4)
+  ; CHECK-NEXT:   [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4)
+  ; CHECK-NEXT:   [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]]
+  ; CHECK-NEXT:   t2Bcc %bb.8, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.for.body:
+  ; CHECK-NEXT:   successors: %bb.8(0x04000000), %bb.7(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7
+  ; CHECK-NEXT:   [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr
+  ; CHECK-NEXT:   [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4)
+  ; CHECK-NEXT:   [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4)
+  ; CHECK-NEXT:   [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]]
+  ; CHECK-NEXT:   [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2Bcc %bb.7, 1 /* CC::ne */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.8, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7
+  ; CHECK-NEXT:   [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8
+  ; CHECK-NEXT:   [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.for.end:
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9
+  ; CHECK-NEXT:   [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r0 = COPY [[VMOVRS]]
+  ; CHECK-NEXT:   tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.4(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %13:gprnopc = COPY $r2
+    %12:gprnopc = COPY $r1
+    %11:gprnopc = COPY $r0
+    t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.1, 10 /* CC::ge */, $cpsr
+
+  bb.4:
+    successors: %bb.3(0x80000000)
+
+    %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = COPY %16
+    %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg
+    %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    %1:gpr = COPY %17
+
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    %2:gprnopc = PHI %1, %bb.1, %9, %bb.2
+    %3:gprnopc = PHI %0, %bb.1, %8, %bb.2
+    %4:gprnopc = PHI %13, %bb.1, %7, %bb.2
+    %5:spr = PHI %15, %bb.1, %6, %bb.2
+    %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg
+    %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+    %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg
+    %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+    %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg
+    %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg
+    %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr
+    %7:gpr = COPY %23
+    %8:gpr = COPY %20
+    %9:gpr = COPY %18
+    t2Bcc %bb.2, 1 /* CC::ne */, $cpsr
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.3.for.end:
+    %10:spr = PHI %14, %bb.4, %6, %bb.2
+    %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg
+    $r0 = COPY %24
+    tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+
+...


        

