[llvm] [ModuloSchedule] Implement modulo variable expansion for pipelining (PR #65609)

Yuta Mukai via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 6 06:00:05 PDT 2024


https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/65609

>From 270811c4ed58f34b3d86fd712988272d8a455584 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 5 Sep 2023 11:43:06 +0000
Subject: [PATCH 01/11] [ModuloSchedule] Implement modulo variable expansion
 for pipelining

Modulo variable expansion is a technique that resolves overlapping
variable lifetimes by unrolling. The existing implementation resolves the
overlap by inserting copies with move instructions for processors with
ordinary registers, such as Arm and x86. This method may result in a very
large number of move instructions, which can cause performance problems.

Modulo variable expansion is enabled by specifying -pipeliner-mve-cg.
A backend must implement some newly defined interfaces in
PipelinerLoopInfo.
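
For illustration only (not part of this patch), here is a source-level
sketch of the transformation; all names and the unroll factor of two are
invented for the example. The value loaded for iteration i is still live
while the load for iteration i+1 executes, so a conventional pipelined
kernel would have to shuffle it between registers with a copy; unrolling
the kernel twice gives each overlapping lifetime its own variable instead.

  #include <cstddef>

  void scale(const int *a, int *b, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i)
      b[i] = a[i] * 3;
  }

  // Roughly what the MVE expander produces for a 2-stage schedule: a trip
  // count guard, a prolog, a kernel unrolled by 2, an epilog, and the
  // original loop for the remainder.
  void scale_mve(const int *a, int *b, std::size_t n) {
    std::size_t i = 0;
    if (n >= 3) {                // enough iterations for the pipeline
      int t0 = a[0];             // prolog: stage 0 of iteration 0
      for (i = 0; i + 2 < n; i += 2) {
        int t1 = a[i + 1];       // kernel copy #0: the next load overlaps
        b[i] = t0 * 3;           //   with the use of the previous one
        t0 = a[i + 2];           // kernel copy #1: and vice versa
        b[i + 1] = t1 * 3;
      }
      b[i] = t0 * 3;             // epilog: finish the in-flight iteration
      ++i;
    }
    for (; i < n; ++i)           // remainder runs the original loop
      b[i] = a[i] * 3;
  }

The guard n >= 3 corresponds to the check generated by the expander: the
trip count must be greater than NumStages + NumUnroll - 2 (here 2 + 2 - 2)
for the pipelined path to be taken.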
---
 llvm/include/llvm/CodeGen/ModuloSchedule.h  |  65 +++
 llvm/include/llvm/CodeGen/TargetInstrInfo.h |  24 +
 llvm/lib/CodeGen/MachinePipeliner.cpp       |   9 +
 llvm/lib/CodeGen/ModuloSchedule.cpp         | 615 ++++++++++++++++++++
 4 files changed, 713 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index fd424163f0d19..1a555728a0985 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -370,6 +370,71 @@ class PeelingModuloScheduleExpander {
   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
 };
 
+/// Expand the kernel using the modulo variable expansion (MVE) algorithm.
+/// It unrolls the kernel enough to avoid overlapping register lifetimes.
+class ModuloScheduleExpanderMVE {
+private:
+  using ValueMapTy = DenseMap<unsigned, unsigned>;
+  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
+  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
+
+  ModuloSchedule &Schedule;
+  MachineFunction &MF;
+  const TargetSubtargetInfo &ST;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo *TII = nullptr;
+  LiveIntervals &LIS;
+
+  MachineBasicBlock *OrigKernel = nullptr;
+  MachineBasicBlock *OrigPreheader = nullptr;
+  MachineBasicBlock *OrigExit = nullptr;
+  MachineBasicBlock *Check = nullptr;
+  MachineBasicBlock *Prolog = nullptr;
+  MachineBasicBlock *NewKernel = nullptr;
+  MachineBasicBlock *Epilog = nullptr;
+  MachineBasicBlock *NewPreheader = nullptr;
+  MachineBasicBlock *NewExit = nullptr;
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
+
+  /// The number of unrolls required to avoid overlapping live ranges.
+  /// NumUnroll = 1 means no unrolling.
+  int NumUnroll;
+
+  void calcNumUnroll();
+  void generatePipelinedLoop();
+  void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
+  void generatePhi(MachineInstr *OrigMI, int UnrollNum,
+                   SmallVectorImpl<ValueMapTy> &PrologVRMap,
+                   SmallVectorImpl<ValueMapTy> &KernelVRMap,
+                   SmallVectorImpl<ValueMapTy> &PhiVRMap);
+  void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
+                      SmallVectorImpl<ValueMapTy> &KernelVRMap);
+  void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
+                      SmallVectorImpl<ValueMapTy> &EpilogVRMap);
+  void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
+
+  MachineInstr *cloneInstr(MachineInstr *OldMI);
+
+  void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef);
+
+  void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
+                         unsigned UnrollNum,
+                         SmallVectorImpl<ValueMapTy> &VRMapProlog,
+                         SmallVectorImpl<ValueMapTy> &VRMapPhi);
+  void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum,
+                      SmallVectorImpl<ValueMapTy> &CurVRMap,
+                      SmallVectorImpl<ValueMapTy> *PrevVRMap);
+
+public:
+  ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
+                            LiveIntervals &LIS)
+      : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
+        TII(ST.getInstrInfo()), LIS(LIS) {}
+
+  void expand();
+  static bool canApply(MachineLoop &L);
+};
+
 /// Expander that simply annotates each scheduled instruction with a post-instr
 /// symbol that can be consumed by the ModuloScheduleTest pass.
 ///
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 9fd0ebe6956fb..71ff2aff8fb32 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -766,6 +766,20 @@ class TargetInstrInfo : public MCInstrInfo {
     createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
                                     SmallVectorImpl<MachineOperand> &Cond) = 0;
 
+    /// Create a condition to determine if the remaining trip count represented
+    /// by the loop counter CounterReg is greater than TC. Some instructions
+    /// such as comparisons may be inserted at the bottom of MBB. CounterReg
+    /// must be accessible there.
+    ///
+    /// The definition of the return value is the same as for the variant above.
+    virtual std::optional<bool>
+    createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    Register CounterReg) {
+      llvm_unreachable(
+          "Target didn't implement createTripCountGreaterCondition");
+    }
+
     /// Modify the loop such that the trip count is
     /// OriginalTC + TripCountAdjust.
     virtual void adjustTripCount(int TripCountAdjust) = 0;
@@ -779,6 +793,16 @@ class TargetInstrInfo : public MCInstrInfo {
     /// Once this function is called, no other functions on this object are
     /// valid; the loop has been removed.
     virtual void disposed() = 0;
+
+    /// Return the initial value of the loop counter.
+    virtual Register getCounterInitReg() {
+      llvm_unreachable("Target didn't implement getCounterInitReg");
+    }
+
+    /// Return the updated value of the loop counter in the original loop.
+    virtual Register getCounterUpdatedReg() {
+      llvm_unreachable("Target didn't implement getCounterUpdatedReg");
+    }
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b9c6765be445a..fbfb3f99ccef6 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -192,6 +192,10 @@ static cl::opt<int>
                       cl::desc("Margin representing the unused percentage of "
                                "the register pressure limit"));
 
+static cl::opt<bool>
+    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
+               cl::desc("Use the MVE code generator for software pipelining"));
+
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -677,6 +681,11 @@ void SwingSchedulerDAG::schedule() {
   if (ExperimentalCodeGen && NewInstrChanges.empty()) {
     PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
     MSE.expand();
+  }
+  if (MVECodeGen && NewInstrChanges.empty() &&
+      ModuloScheduleExpanderMVE::canApply(Loop)) {
+    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
+    MSE.expand();
   } else {
     ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
     MSE.expand();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index bdae94c4e6f88..06641ad9da5fc 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2096,6 +2096,621 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
   MSE.cleanup();
 }
 
+MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
+  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+
+  // TODO: Offset information needs to be corrected.
+  NewMI->dropMemRefs(MF);
+
+  return NewMI;
+}
+
+/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
+/// If it is already a dedicated exit, return it. Otherwise, insert a new
+/// block between them and return the new block.
+static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
+                                              MachineBasicBlock *Exit) {
+  if (Exit->pred_size() == 1)
+    return Exit;
+
+  MachineFunction *MF = Loop->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+  MachineBasicBlock *NewExit =
+      MF->CreateMachineBasicBlock(Loop->getBasicBlock());
+  MF->insert(Loop->getIterator(), NewExit);
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+  if (TBB == Loop)
+    FBB = NewExit;
+  else if (FBB == Loop)
+    TBB = NewExit;
+  else
+    llvm_unreachable("unexpected loop structure");
+  TII->removeBranch(*Loop);
+  TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
+  Loop->removeSuccessor(Exit);
+  Loop->addSuccessor(NewExit);
+  TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
+  NewExit->addSuccessor(Exit);
+
+  for (MachineInstr &Phi : Exit->phis()) {
+    for (MachineOperand &MO : Phi.operands())
+      if (MO.isMBB() && MO.getMBB() == Loop)
+        MO.setMBB(NewExit);
+  }
+
+  return NewExit;
+}
+
+/// Generate a pipelined loop that is unrolled by using the MVE algorithm and
+/// any other necessary blocks. The control flow is modified to execute the
+/// pipelined loop if the trip count satisfies the condition, otherwise the
+/// original loop. The original loop is also used to execute the remainder
+/// iterations that occur due to unrolling.
+void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
+  // The control flow for pipelining with MVE:
+  //
+  // OrigPreheader:
+  //   // The block that is originally the loop preheader
+  //   goto Check
+  //
+  // Check:
+  //   // Check whether the trip count satisfies the requirements to pipeline.
+  //   if (LoopCounter > NumStages + NumUnroll - 2)
+  //     // The minimum number of iterations to pipeline =
+  //     //   iterations executed in prolog/epilog (NumStages-1) +
+  //     //   iterations executed in one kernel run (NumUnroll)
+  //     goto Prolog
+  //   // fallback to the original loop
+  //   goto NewPreheader
+  //
+  // Prolog:
+  //   // All prolog stages. There are no direct branches to the epilogue.
+  //   goto NewKernel
+  //
+  // NewKernel:
+  //   // NumUnroll copies of the kernel
+  //   if (LoopCounter > MVE-1)
+  //     goto NewKernel
+  //   goto Epilog
+  //
+  // Epilog:
+  //   // All epilog stages.
+  //   if (LoopCounter > 0)
+  //     // The remainder is executed in the original loop
+  //     goto NewPreheader
+  //   goto NewExit
+  //
+  // NewPreheader:
+  //   // Newly created preheader for the original loop.
+  //   // The initial values of the phis in the loop are merged from two paths.
+  //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
+  //   goto OrigKernel
+  //
+  // OrigKernel:
+  //   // The original loop block.
+  //   if (LoopCounter != 0)
+  //     goto OrigKernel
+  //   goto NewExit
+  //
+  // NewExit:
+  //   // Newly created dedicated exit for the original loop.
+  //   // Merge values which are referenced after the loop
+  //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
+  //   goto OrigExit
+  //
+  // OrigExit:
+  //   // The block that is originally the loop exit.
+  //   // If it is already a dedicated exit, NewExit is not created.
+
+  // An example of where each stage is executed:
+  // Assume #Stages 3, #MVE 4, #Iterations 12
+  // Iter   0 1 2 3 4 5 6 7 8 9 10-11
+  // -------------------------------------------------
+  // Stage  0                          Prolog#0
+  // Stage  1 0                        Prolog#1
+  // Stage  2 1 0                      Kernel Unroll#0 Iter#0
+  // Stage    2 1 0                    Kernel Unroll#1 Iter#0
+  // Stage      2 1 0                  Kernel Unroll#2 Iter#0
+  // Stage        2 1 0                Kernel Unroll#3 Iter#0
+  // Stage          2 1 0              Kernel Unroll#0 Iter#1
+  // Stage            2 1 0            Kernel Unroll#1 Iter#1
+  // Stage              2 1 0          Kernel Unroll#2 Iter#1
+  // Stage                2 1 0        Kernel Unroll#3 Iter#1
+  // Stage                  2 1        Epilog#0
+  // Stage                    2        Epilog#1
+  // Stage                      0-2    OrigKernel
+
+  LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
+  assert(LoopInfo && "Must be able to analyze loop!");
+
+  calcNumUnroll();
+
+  Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+
+  MF.insert(OrigKernel->getIterator(), Check);
+  MF.insert(OrigKernel->getIterator(), Prolog);
+  MF.insert(OrigKernel->getIterator(), NewKernel);
+  MF.insert(OrigKernel->getIterator(), Epilog);
+  MF.insert(OrigKernel->getIterator(), NewPreheader);
+
+  NewExit = createDedicatedExit(OrigKernel, OrigExit);
+
+  NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
+  TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
+
+  OrigPreheader->addSuccessor(Check);
+  TII->removeBranch(*OrigPreheader);
+  TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
+
+  Check->addSuccessor(Prolog);
+  Check->addSuccessor(NewPreheader);
+
+  Prolog->addSuccessor(NewKernel);
+
+  NewKernel->addSuccessor(NewKernel);
+  NewKernel->addSuccessor(Epilog);
+
+  Epilog->addSuccessor(NewPreheader);
+  Epilog->addSuccessor(NewExit);
+
+  SmallVector<MachineOperand, 4> Cond;
+  LoopInfo->createTripCountGreaterCondition(
+      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond,
+      LoopInfo->getCounterInitReg());
+  TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
+
+  // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
+  // register#
+  SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
+  generateProlog(PrologVRMap);
+  generateKernel(PrologVRMap, KernelVRMap);
+  generateEpilog(KernelVRMap, EpilogVRMap);
+}
+
+/// Replace MI's use operands according to the maps.
+void ModuloScheduleExpanderMVE::updateInstrUse(
+    MachineInstr *MI, int StageNum, int PhaseNum,
+    SmallVectorImpl<ValueMapTy> &CurVRMap,
+    SmallVectorImpl<ValueMapTy> *PrevVRMap) {
+  // If MI is in the prolog/kernel/epilog block, CurVRMap is
+  // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
+  // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
+  // Refer to the appropriate map according to the stage difference between
+  // MI and the definition of an operand.
+
+  for (MachineOperand &UseMO : MI->uses()) {
+    if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
+      continue;
+    int DiffStage = 0;
+    Register OrigReg = UseMO.getReg();
+    MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
+    if (!DefInst || DefInst->getParent() != OrigKernel)
+      continue;
+    unsigned InitReg = 0;
+    unsigned DefReg = OrigReg;
+    if (DefInst->isPHI()) {
+      ++DiffStage;
+      unsigned LoopReg;
+      getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
+      // LoopReg is guaranteed to be defined within the loop by canApply()
+      DefReg = LoopReg;
+      DefInst = MRI.getVRegDef(LoopReg);
+    }
+    unsigned DefStageNum = Schedule.getStage(DefInst);
+    DiffStage += StageNum - DefStageNum;
+    Register NewReg;
+    if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
+      // NewReg is defined in a previous phase of the same block
+      NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
+    else if (!PrevVRMap)
+      // Since this is the first iteration, refer to the initial register of
+      // the loop
+      NewReg = InitReg;
+    else
+      // Cases where DiffStage is larger than PhaseNum.
+      // If MI is in the kernel block, the value is defined by the previous
+      // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
+      // value is defined in the kernel block and KernelVRMap is referenced.
+      NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
+
+    const TargetRegisterClass *NRC =
+        MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
+    if (NRC)
+      UseMO.setReg(NewReg);
+    else {
+      Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+      BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+              SplitReg)
+          .addReg(NewReg);
+      UseMO.setReg(SplitReg);
+    }
+  }
+}
+
+/// Return the phi that references Reg, if any.
+/// canApply() guarantees that at most one such phi exists.
+static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
+  for (MachineInstr &Phi : Loop->phis()) {
+    unsigned InitVal, LoopVal;
+    getPhiRegs(Phi, Loop, InitVal, LoopVal);
+    if (LoopVal == Reg)
+      return Phi;
+  }
+  return nullptr;
+}
+
+/// Generate phis for registers defined by OrigMI.
+void ModuloScheduleExpanderMVE::generatePhi(
+    MachineInstr *OrigMI, int UnrollNum,
+    SmallVectorImpl<ValueMapTy> &PrologVRMap,
+    SmallVectorImpl<ValueMapTy> &KernelVRMap,
+    SmallVectorImpl<ValueMapTy> &PhiVRMap) {
+  int StageNum = Schedule.getStage(OrigMI);
+  bool UsePrologReg;
+  if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
+    UsePrologReg = true;
+  else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
+    UsePrologReg = false;
+  else
+    return;
+
+  // Examples that show which stages are merged by phi.
+  // Meaning of the symbol following the stage number:
+  //   a/b: Stages with the same letter are merged (UsePrologReg == true)
+  //   +: Merged with the initial value (UsePrologReg == false)
+  //   *: No phis required
+  //
+  // #Stages 3, #MVE 4
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0a                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2* 1* 0*           Kernel Unroll#0
+  // Stage     2* 1* 0+        Kernel Unroll#1
+  // Stage        2* 1+ 0a     Kernel Unroll#2
+  // Stage           2+ 1a 0b  Kernel Unroll#3
+  //
+  // #Stages 3, #MVE 2
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0a                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2* 1+ 0a           Kernel Unroll#0
+  // Stage     2+ 1a 0b        Kernel Unroll#1
+  //
+  // #Stages 3, #MVE 1
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0*                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2+ 1a 0b           Kernel Unroll#0
+
+  for (MachineOperand &DefMO : OrigMI->defs()) {
+    if (!DefMO.isReg())
+      continue;
+    Register OrigReg = DefMO.getReg();
+    auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
+    if (NewReg == KernelVRMap[UnrollNum].end())
+      continue;
+    Register CorrespondReg;
+    if (UsePrologReg) {
+      int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
+      CorrespondReg = PrologVRMap[PrologNum][OrigReg];
+    } else {
+      MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
+      if (!Phi)
+        continue;
+      CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
+    }
+
+    assert(CorrespondReg.isValid());
+    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+    BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
+            TII->get(TargetOpcode::PHI), PhiReg)
+        .addReg(NewReg->second)
+        .addMBB(NewKernel)
+        .addReg(CorrespondReg)
+        .addMBB(Prolog);
+    PhiVRMap[UnrollNum][OrigReg] = PhiReg;
+  }
+}
+
+static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
+                          MachineBasicBlock *NewMBB) {
+  for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
+    if (Phi.getOperand(Idx).getReg() == OrigReg) {
+      Phi.getOperand(Idx).setReg(NewReg);
+      Phi.getOperand(Idx + 1).setMBB(NewMBB);
+      return;
+    }
+  }
+}
+
+/// Generate phis that merge values from multiple routes
+void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
+                                                          Register NewReg) {
+  SmallVector<MachineOperand *> UsesAfterLoop;
+  SmallVector<MachineInstr *> LoopPhis;
+  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
+                                         E = MRI.use_end();
+       I != E; ++I) {
+    MachineOperand &O = *I;
+    if (O.getParent()->getParent() != OrigKernel &&
+        O.getParent()->getParent() != Prolog &&
+        O.getParent()->getParent() != NewKernel &&
+        O.getParent()->getParent() != Epilog)
+      UsesAfterLoop.push_back(&O);
+    if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
+      LoopPhis.push_back(O.getParent());
+  }
+
+  // Merge the route that only executes the pipelined loop (when there are no
+  // remaining iterations) with the route that executes the original loop.
+  if (!UsesAfterLoop.empty()) {
+    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+    BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
+            TII->get(TargetOpcode::PHI), PhiReg)
+        .addReg(OrigReg)
+        .addMBB(OrigKernel)
+        .addReg(NewReg)
+        .addMBB(Epilog);
+
+    for (MachineOperand *MO : UsesAfterLoop)
+      MO->setReg(PhiReg);
+
+    if (!LIS.hasInterval(PhiReg))
+      LIS.createEmptyInterval(PhiReg);
+  }
+
+  // Merge the route through the pipelined loop with the route that bypasses
+  // it, in front of the original loop.
+  if (!LoopPhis.empty()) {
+    for (MachineInstr *Phi : LoopPhis) {
+      unsigned InitReg, LoopReg;
+      getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
+      Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
+      BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
+              TII->get(TargetOpcode::PHI), NewInit)
+          .addReg(InitReg)
+          .addMBB(Check)
+          .addReg(NewReg)
+          .addMBB(Epilog);
+      replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
+    }
+  }
+}
+
+void ModuloScheduleExpanderMVE::generateProlog(
+    SmallVectorImpl<ValueMapTy> &PrologVRMap) {
+  PrologVRMap.clear();
+  PrologVRMap.resize(Schedule.getNumStages() - 1);
+  DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
+  for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
+       ++PrologNum) {
+    for (MachineInstr *MI : Schedule.getInstructions()) {
+      if (MI->isPHI())
+        continue;
+      int StageNum = Schedule.getStage(MI);
+      if (StageNum > PrologNum)
+        continue;
+      MachineInstr *NewMI = cloneInstr(MI);
+      updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
+      NewMIMap[NewMI] = {PrologNum, StageNum};
+      Prolog->push_back(NewMI);
+    }
+  }
+
+  for (auto I : NewMIMap) {
+    MachineInstr *MI = I.first;
+    int PrologNum = I.second.first;
+    int StageNum = I.second.second;
+    updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "prolog:\n";
+    Prolog->dump();
+  });
+}
+
+void ModuloScheduleExpanderMVE::generateKernel(
+    SmallVectorImpl<ValueMapTy> &PrologVRMap,
+    SmallVectorImpl<ValueMapTy> &KernelVRMap) {
+  KernelVRMap.clear();
+  KernelVRMap.resize(NumUnroll);
+  SmallVector<ValueMapTy> PhiVRMap;
+  PhiVRMap.resize(NumUnroll);
+  DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
+  for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
+    for (MachineInstr *MI : Schedule.getInstructions()) {
+      if (MI->isPHI())
+        continue;
+      int StageNum = Schedule.getStage(MI);
+      MachineInstr *NewMI = cloneInstr(MI);
+      updateInstrDef(NewMI, KernelVRMap[UnrollNum],
+                     (UnrollNum == NumUnroll - 1 && StageNum == 0));
+      generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
+      NewMIMap[NewMI] = {UnrollNum, StageNum};
+      NewKernel->push_back(NewMI);
+    }
+  }
+
+  for (auto I : NewMIMap) {
+    MachineInstr *MI = I.first;
+    int UnrollNum = I.second.first;
+    int StageNum = I.second.second;
+    updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
+  }
+
+  // The loop continues if the remaining trip count is greater than NumUnroll-1
+  SmallVector<MachineOperand, 4> Cond;
+  LoopInfo->createTripCountGreaterCondition(
+      NumUnroll - 1, *NewKernel, Cond,
+      KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+  TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
+
+  LLVM_DEBUG({
+    dbgs() << "kernel:\n";
+    NewKernel->dump();
+  });
+}
+
+void ModuloScheduleExpanderMVE::generateEpilog(
+    SmallVectorImpl<ValueMapTy> &KernelVRMap,
+    SmallVectorImpl<ValueMapTy> &EpilogVRMap) {
+  EpilogVRMap.clear();
+  EpilogVRMap.resize(Schedule.getNumStages() - 1);
+  DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
+  for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
+       ++EpilogNum) {
+    for (MachineInstr *MI : Schedule.getInstructions()) {
+      if (MI->isPHI())
+        continue;
+      int StageNum = Schedule.getStage(MI);
+      if (StageNum <= EpilogNum)
+        continue;
+      MachineInstr *NewMI = cloneInstr(MI);
+      updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
+      NewMIMap[NewMI] = {EpilogNum, StageNum};
+      Epilog->push_back(NewMI);
+    }
+  }
+
+  for (auto I : NewMIMap) {
+    MachineInstr *MI = I.first;
+    int EpilogNum = I.second.first;
+    int StageNum = I.second.second;
+    updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
+  }
+
+  // If there are remaining iterations, they are executed in the original loop
+  SmallVector<MachineOperand, 4> Cond;
+  LoopInfo->createTripCountGreaterCondition(
+      0, *Epilog, Cond,
+      KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+  TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
+
+  LLVM_DEBUG({
+    dbgs() << "epilog:\n";
+    Epilog->dump();
+  });
+}
+
+/// Calculate the number of unrolls required and store it in NumUnroll.
+void ModuloScheduleExpanderMVE::calcNumUnroll() {
+  DenseMap<MachineInstr *, unsigned> Inst2Idx;
+  NumUnroll = 1;
+  for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
+    Inst2Idx[Schedule.getInstructions()[I]] = I;
+
+  for (MachineInstr *MI : Schedule.getInstructions()) {
+    if (MI->isPHI())
+      continue;
+    int StageNum = Schedule.getStage(MI);
+    for (const MachineOperand &MO : MI->uses()) {
+      if (!MO.isReg() || !MO.getReg().isVirtual())
+        continue;
+      MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
+      if (DefMI->getParent() != OrigKernel)
+        continue;
+
+      int NumUnrollLocal = 1;
+      if (DefMI->isPHI()) {
+        ++NumUnrollLocal;
+        // canApply() guarantees that DefMI is not a phi and is an
+        // instruction in the loop
+        DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
+      }
+      NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
+      if (Inst2Idx[MI] <= Inst2Idx[DefMI])
+        --NumUnrollLocal;
+      NumUnroll = std::max(NumUnroll, NumUnrollLocal);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
+}
+
+/// Create new virtual registers for definitions of NewMI and update NewMI.
+/// If the definitions are referenced after the pipelined loop, phis are
+/// created to merge with other routes.
+void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
+                                               ValueMapTy &VRMap,
+                                               bool LastDef) {
+  for (MachineOperand &MO : NewMI->operands()) {
+    if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef())
+      continue;
+    Register Reg = MO.getReg();
+    const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+    Register NewReg = MRI.createVirtualRegister(RC);
+    MO.setReg(NewReg);
+    VRMap[Reg] = NewReg;
+    if (LastDef)
+      mergeRegUsesAfterPipeline(Reg, NewReg);
+  }
+}
+
+void ModuloScheduleExpanderMVE::expand() {
+  OrigKernel = Schedule.getLoop()->getTopBlock();
+  OrigPreheader = Schedule.getLoop()->getLoopPreheader();
+  OrigExit = Schedule.getLoop()->getExitBlock();
+
+  LLVM_DEBUG(Schedule.dump());
+
+  generatePipelinedLoop();
+}
+
+/// Check if ModuloScheduleExpanderMVE can be applied to L
+bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
+  if (!L.getExitBlock()) {
+    LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+    return false;
+  }
+
+  MachineBasicBlock *BB = L.getTopBlock();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  // Put some constraints on the operands of the phis to simplify the
+  // transformation
+  DenseSet<unsigned> UsedByPhi;
+  for (MachineInstr &MI : BB->phis()) {
+    // Registers defined by phis must be used only inside the loop and never
+    // be used by phis.
+    for (MachineOperand &MO : MI.defs())
+      if (MO.isReg())
+        for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
+          if (Ref.getParent() != BB || Ref.isPHI()) {
+            LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+            return false;
+          }
+
+    // A source register from the loop block must be defined inside the loop.
+    // A register defined inside the loop must be referenced by at most one
+    // phi.
+    unsigned InitVal, LoopVal;
+    getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
+    if (!Register(LoopVal).isVirtual() ||
+        MRI.getVRegDef(LoopVal)->getParent() != BB) {
+      LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+      return false;
+    }
+    if (UsedByPhi.count(LoopVal)) {
+      LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+      return false;
+    }
+    UsedByPhi.insert(LoopVal);
+  }
+
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // ModuloScheduleTestPass implementation
 //===----------------------------------------------------------------------===//
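
The unroll factor computed by calcNumUnroll() in the patch above can be
summarized with a small self-contained model (an assumed simplification
written for this note, not code from the patch): the factor is the largest
def-to-use distance measured in stages, plus one when the value reaches the
use through a loop-carried phi, and minus one when the use is ordered at or
before the def inside the kernel, because the old value then dies before
the new one is defined.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  // One register dependence inside the kernel of a modulo schedule.
  struct Dep {
    int UseStage;            // stage of the instruction using the value
    int DefStage;            // stage of the instruction defining it
    bool ThroughPhi;         // value reaches the use via a loop-carried phi
    bool UseOrderedAfterDef; // the use comes after the def in kernel order
  };

  static int calcNumUnroll(const std::vector<Dep> &Deps) {
    int NumUnroll = 1;
    for (const Dep &D : Deps) {
      int Local = 1 + (D.ThroughPhi ? 1 : 0) + (D.UseStage - D.DefStage);
      if (!D.UseOrderedAfterDef)
        --Local; // the old value dies before the new def; one fewer copy live
      NumUnroll = std::max(NumUnroll, Local);
    }
    return NumUnroll;
  }

  int main() {
    // A value defined in stage 0 and used in stage 2 of the same source
    // iteration: two newer copies are defined before it dies, so three
    // copies are live at once and the kernel is unrolled three times.
    std::printf("NumUnroll = %d\n", calcNumUnroll({{2, 0, false, true}}));
    return 0;
  }

A stage-0 to stage-2 dependence carried through a phi would give
1 + 1 + 2 = 4 instead, which is one way to arrive at the #MVE 4 used in the
example table in generatePipelinedLoop() above.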

>From 400880720788ab1104bc4fd5450ee6c605ad70c3 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 05:19:58 +0000
Subject: [PATCH 02/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/lib/CodeGen/ModuloSchedule.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 06641ad9da5fc..830aad32ed9e7 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2670,7 +2670,7 @@ void ModuloScheduleExpanderMVE::expand() {
 /// Check if ModuloScheduleExpanderMVE can be applied to L
 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
   if (!L.getExitBlock()) {
-    LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+    LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block\n";);
     return false;
   }
 
@@ -2687,7 +2687,8 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
       if (MO.isReg())
         for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
           if (Ref.getParent() != BB || Ref.isPHI()) {
-            LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+            LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
+                                 "referenced outside of the loop or by phi.";);
             return false;
           }
 
@@ -2698,11 +2699,14 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
     getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
     if (!Register(LoopVal).isVirtual() ||
         MRI.getVRegDef(LoopVal)->getParent() != BB) {
-      LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+      LLVM_DEBUG(
+          dbgs() << "Can not apply MVE expander: A phi source value coming "
+                    "from the loop is not defined in the loop.\n";);
       return false;
     }
     if (UsedByPhi.count(LoopVal)) {
-      LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+      LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
+                           "loop is referenced by two or more phis.\n";);
       return false;
     }
     UsedByPhi.insert(LoopVal);

>From f1343ac70b1c1f5eb6fbf9ed334b4aa42d0c8e30 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 13:56:28 +0000
Subject: [PATCH 03/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 35 ++++++++------------
 llvm/lib/CodeGen/MachinePipeliner.cpp        |  6 ++--
 llvm/lib/CodeGen/ModuloSchedule.cpp          | 20 +++++------
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp     |  9 +++++
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp |  9 +++++
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp     |  9 +++++
 6 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 71ff2aff8fb32..1d34be037b559 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -766,19 +766,18 @@ class TargetInstrInfo : public MCInstrInfo {
     createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
                                     SmallVectorImpl<MachineOperand> &Cond) = 0;
 
-    /// Create a condition to determine if the remaining trip count represented
-    /// by the loop counter CounterReg is greater than TC. Some instructions
-    /// such as comparisons may be inserted at the bottom of MBB. CounterReg
-    /// must be accessible there.
+    /// Create a condition to determine if the remaining trip count for a phase
+    /// is greater than TC. Some instructions such as comparisons may be
+    /// inserted at the bottom of MBB. All the instructions expanded for the
+    /// phase must be inserted in MBB before calling this function. RegMap is
+    /// the map from the original registers to the expanded registers for the
+    /// phase.
     ///
-    /// The definition of the return value is the same as for the variant above.
-    virtual std::optional<bool>
-    createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
-                                    SmallVectorImpl<MachineOperand> &Cond,
-                                    Register CounterReg) {
-      llvm_unreachable(
-          "Target didn't implement createTripCountGreaterCondition");
-    }
+    /// MBB can also be a predecessor of the prologue block. Then RegMap must be
+    /// empty and the compared value is the initial value of the trip count.
+    virtual void createRemainingIterationsGreaterCondition(
+        int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+        DenseMap<unsigned, unsigned> RegMap) = 0;
 
     /// Modify the loop such that the trip count is
     /// OriginalTC + TripCountAdjust.
@@ -794,15 +793,9 @@ class TargetInstrInfo : public MCInstrInfo {
     /// valid; the loop has been removed.
     virtual void disposed() = 0;
 
-    /// Return the initial value of the loop counter.
-    virtual Register getCounterInitReg() {
-      llvm_unreachable("Target didn't implement getCounterInitReg");
-    }
-
-    /// Return the updated value of the loop counter in the original loop.
-    virtual Register getCounterUpdatedReg() {
-      llvm_unreachable("Target didn't implement getCounterUpdatedReg");
-    }
+    /// Return true if the target can expand a pipelined schedule with modulo
+    /// variable expansion.
+    virtual bool isMVEExpanderSupported() = 0;
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index fbfb3f99ccef6..6fa127dcd6adc 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -681,9 +681,9 @@ void SwingSchedulerDAG::schedule() {
   if (ExperimentalCodeGen && NewInstrChanges.empty()) {
     PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
     MSE.expand();
-  }
-  if (MVECodeGen && NewInstrChanges.empty() &&
-      ModuloScheduleExpanderMVE::canApply(Loop)) {
+  } else if (MVECodeGen && NewInstrChanges.empty() &&
+             LoopPipelinerInfo->isMVEExpanderSupported() &&
+             ModuloScheduleExpanderMVE::canApply(Loop)) {
     ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
     MSE.expand();
   } else {
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 830aad32ed9e7..4ba3fecef4f06 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2262,9 +2262,8 @@ void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
   Epilog->addSuccessor(NewExit);
 
   SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createTripCountGreaterCondition(
-      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond,
-      LoopInfo->getCounterInitReg());
+  LoopInfo->createRemainingIterationsGreaterCondition(
+      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, ValueMapTy());
   TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
 
   // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
@@ -2552,9 +2551,8 @@ void ModuloScheduleExpanderMVE::generateKernel(
 
   // The loop continues if the remaining trip count is greater than NumUnroll-1
   SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createTripCountGreaterCondition(
-      NumUnroll - 1, *NewKernel, Cond,
-      KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+  LoopInfo->createRemainingIterationsGreaterCondition(
+      NumUnroll - 1, *NewKernel, Cond, KernelVRMap[NumUnroll - 1]);
   TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
 
   LLVM_DEBUG({
@@ -2591,11 +2589,13 @@ void ModuloScheduleExpanderMVE::generateEpilog(
     updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
   }
 
-  // If there are remaining iterations, they are executed in the original loop
+  // If there are remaining iterations, they are executed in the original loop.
+  // Instructions related to loop control, such as loop counter comparison,
+  // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
+  // in stage 0. Thus, the map for the last kernel copy is used.
   SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createTripCountGreaterCondition(
-      0, *Epilog, Cond,
-      KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+  LoopInfo->createRemainingIterationsGreaterCondition(
+      0, *Epilog, Cond, KernelVRMap[NumUnroll - 1]);
   TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
 
   LLVM_DEBUG({
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 5d0468948dfb6..66783dfc29c7b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6823,11 +6823,20 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
       llvm_unreachable("Unknown EndLoop");
   }
 
+  void createRemainingIterationsGreaterCondition(
+      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+      DenseMap<unsigned, unsigned> RegMap) override {
+    llvm_unreachable(
+        "Target didn't implement createRemainingIterationsGreaterCondition");
+  }
+
   void setPreheader(MachineBasicBlock *NewPreheader) override {}
 
   void adjustTripCount(int TripCountAdjust) override {}
 
   void disposed() override {}
+
+  bool isMVEExpanderSupported() override { return false; }
 };
 
 void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index b9bf26ba7cca1..007a6e41b82aa 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -770,6 +770,13 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     return TripCount > TC;
   }
 
+  void createRemainingIterationsGreaterCondition(
+      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+      DenseMap<unsigned, unsigned> RegMap) override {
+    llvm_unreachable(
+        "Target didn't implement createRemainingIterationsGreaterCondition");
+  }
+
   void setPreheader(MachineBasicBlock *NewPreheader) override {
     NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(),
                          Loop);
@@ -798,6 +805,8 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
   }
 
   void disposed() override { Loop->eraseFromParent(); }
+
+  bool isMVEExpanderSupported() override { return false; }
 };
 } // namespace
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 5f5eb31a5a85f..250b3ee352ad2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -5439,6 +5439,13 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     return TripCount > TC;
   }
 
+  void createRemainingIterationsGreaterCondition(
+      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+      DenseMap<unsigned, unsigned> RegMap) override {
+    llvm_unreachable(
+        "Target didn't implement createRemainingIterationsGreaterCondition");
+  }
+
   void setPreheader(MachineBasicBlock *NewPreheader) override {
     // Do nothing. We want the LOOP setup instruction to stay in the *old*
     // preheader, so we can use BDZ in the prologs to adapt the loop trip count.
@@ -5463,6 +5470,8 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     // Ensure the loop setup instruction is deleted too.
     LoopCount->eraseFromParent();
   }
+
+  bool isMVEExpanderSupported() override { return false; }
 };
 } // namespace
 

>From 02e79a9c681bc261f0d9afbc239802d6b8eeb917 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 14:38:57 +0000
Subject: [PATCH 04/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/lib/CodeGen/ModuloSchedule.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 4ba3fecef4f06..52b729f6c8c6a 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2670,7 +2670,7 @@ void ModuloScheduleExpanderMVE::expand() {
 /// Check if ModuloScheduleExpanderMVE can be applied to L
 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
   if (!L.getExitBlock()) {
-    LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block\n";);
+    LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n";);
     return false;
   }
 
@@ -2688,7 +2688,7 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
         for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
           if (Ref.getParent() != BB || Ref.isPHI()) {
             LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
-                                 "referenced outside of the loop or by phi.";);
+                                 "referenced outside of the loop or by phi.\n";);
             return false;
           }
 

>From 5f97d45838a2d9ebe7132bdd8fdf803ba22384aa Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 5 Mar 2024 22:40:25 +0900
Subject: [PATCH 05/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/include/llvm/CodeGen/ModuloSchedule.h   |  6 ++--
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 15 ++++++----
 llvm/lib/CodeGen/ModuloSchedule.cpp          | 30 ++++++++++++--------
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp     |  2 +-
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp |  2 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp     |  2 +-
 6 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 1a555728a0985..0e6fd2dabc64f 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -408,9 +408,11 @@ class ModuloScheduleExpanderMVE {
                    SmallVectorImpl<ValueMapTy> &KernelVRMap,
                    SmallVectorImpl<ValueMapTy> &PhiVRMap);
   void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
-                      SmallVectorImpl<ValueMapTy> &KernelVRMap);
+                      SmallVectorImpl<ValueMapTy> &KernelVRMap,
+                      InstrMapTy &LastStage0Insts);
   void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
-                      SmallVectorImpl<ValueMapTy> &EpilogVRMap);
+                      SmallVectorImpl<ValueMapTy> &EpilogVRMap,
+                      InstrMapTy &LastStage0Insts);
   void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
 
   MachineInstr *cloneInstr(MachineInstr *OldMI);
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1d34be037b559..e978fb7bfbf31 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -769,15 +769,18 @@ class TargetInstrInfo : public MCInstrInfo {
     /// Create a condition to determine if the remaining trip count for a phase
     /// is greater than TC. Some instructions such as comparisons may be
     /// inserted at the bottom of MBB. All the instructions expanded for the
-    /// phase must be inserted in MBB before calling this function. RegMap is
-    /// the map from the original registers to the expanded registers for the
-    /// phase.
+    /// phase must be inserted in MBB before calling this function.
+    /// LastStage0Insts is the map from the original instructions scheduled at
+    /// stage#0 to the expanded instructions for the last iteration of the
+    /// kernel. LastStage0Insts is intended to obtain the instruction that
+    /// refers to the latest loop counter value.
     ///
-    /// MBB can also be a predecessor of the prologue block. Then RegMap must be
-    /// empty and the compared value is the initial value of the trip count.
+    /// MBB can also be a predecessor of the prologue block. Then
+    /// LastStage0Insts must be empty and the compared value is the initial
+    /// value of the trip count.
     virtual void createRemainingIterationsGreaterCondition(
         int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-        DenseMap<unsigned, unsigned> RegMap) = 0;
+        DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) = 0;
 
     /// Modify the loop such that the trip count is
     /// OriginalTC + TripCountAdjust.
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 52b729f6c8c6a..b843a7fd2ad19 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2263,15 +2263,16 @@ void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
 
   SmallVector<MachineOperand, 4> Cond;
   LoopInfo->createRemainingIterationsGreaterCondition(
-      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, ValueMapTy());
+      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, InstrMapTy());
   TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
 
   // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
   // register#
   SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
+  InstrMapTy LastStage0Insts;
   generateProlog(PrologVRMap);
-  generateKernel(PrologVRMap, KernelVRMap);
-  generateEpilog(KernelVRMap, EpilogVRMap);
+  generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
+  generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
 }
 
 /// Replace MI's use operands according to the maps.
@@ -2522,18 +2523,21 @@ void ModuloScheduleExpanderMVE::generateProlog(
 
 void ModuloScheduleExpanderMVE::generateKernel(
     SmallVectorImpl<ValueMapTy> &PrologVRMap,
-    SmallVectorImpl<ValueMapTy> &KernelVRMap) {
+    SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
   KernelVRMap.clear();
   KernelVRMap.resize(NumUnroll);
   SmallVector<ValueMapTy> PhiVRMap;
   PhiVRMap.resize(NumUnroll);
   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
+  DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
   for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
     for (MachineInstr *MI : Schedule.getInstructions()) {
       if (MI->isPHI())
         continue;
       int StageNum = Schedule.getStage(MI);
       MachineInstr *NewMI = cloneInstr(MI);
+      if (UnrollNum == NumUnroll - 1)
+        LastStage0Insts[MI] = NewMI;
       updateInstrDef(NewMI, KernelVRMap[UnrollNum],
                      (UnrollNum == NumUnroll - 1 && StageNum == 0));
       generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
@@ -2551,8 +2555,8 @@ void ModuloScheduleExpanderMVE::generateKernel(
 
   // If remaining trip count is greater than NumUnroll-1, loop continues
   SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createRemainingIterationsGreaterCondition(
-      NumUnroll - 1, *NewKernel, Cond, KernelVRMap[NumUnroll - 1]);
+  LoopInfo->createRemainingIterationsGreaterCondition(NumUnroll - 1, *NewKernel,
+                                                      Cond, LastStage0Insts);
   TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
 
   LLVM_DEBUG({
@@ -2563,7 +2567,7 @@ void ModuloScheduleExpanderMVE::generateKernel(
 
 void ModuloScheduleExpanderMVE::generateEpilog(
     SmallVectorImpl<ValueMapTy> &KernelVRMap,
-    SmallVectorImpl<ValueMapTy> &EpilogVRMap) {
+    SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
   EpilogVRMap.clear();
   EpilogVRMap.resize(Schedule.getNumStages() - 1);
   DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
@@ -2594,8 +2598,8 @@ void ModuloScheduleExpanderMVE::generateEpilog(
   // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
   // in stage 0. Thus, the map for the last kernel copy is used.
   SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createRemainingIterationsGreaterCondition(
-      0, *Epilog, Cond, KernelVRMap[NumUnroll - 1]);
+  LoopInfo->createRemainingIterationsGreaterCondition(0, *Epilog, Cond,
+                                                      LastStage0Insts);
   TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
 
   LLVM_DEBUG({
@@ -2670,7 +2674,8 @@ void ModuloScheduleExpanderMVE::expand() {
 /// Check if ModuloScheduleExpanderMVE can be applied to L
 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
   if (!L.getExitBlock()) {
-    LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n";);
+    LLVM_DEBUG(
+        dbgs() << "Can not apply MVE expander: No single exit block.\n";);
     return false;
   }
 
@@ -2687,8 +2692,9 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
       if (MO.isReg())
         for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
           if (Ref.getParent() != BB || Ref.isPHI()) {
-            LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
-                                 "referenced outside of the loop or by phi.\n";);
+            LLVM_DEBUG(dbgs()
+                           << "Can not apply MVE expander: A phi result is "
+                              "referenced outside of the loop or by phi.\n";);
             return false;
           }
 
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 66783dfc29c7b..a8e590d59298f 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6825,7 +6825,7 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
 
   void createRemainingIterationsGreaterCondition(
       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<unsigned, unsigned> RegMap) override {
+      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
     llvm_unreachable(
         "Target didn't implement createRemainingIterationsGreaterCondition");
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 007a6e41b82aa..c65d4099373f4 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -772,7 +772,7 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
 
   void createRemainingIterationsGreaterCondition(
       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<unsigned, unsigned> RegMap) override {
+      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
     llvm_unreachable(
         "Target didn't implement createRemainingIterationsGreaterCondition");
   }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 250b3ee352ad2..bb1e298d9b2f7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -5441,7 +5441,7 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
 
   void createRemainingIterationsGreaterCondition(
       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<unsigned, unsigned> RegMap) override {
+      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
     llvm_unreachable(
         "Target didn't implement createRemainingIterationsGreaterCondition");
   }
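
As a rough sketch of how a target hook is expected to consume
LastStage0Insts after this change (an illustrative fragment, not code from
the patch: Update and Init stand for the counter-update instruction and the
initial counter value that the target recorded while analyzing the loop,
and the actual compare/branch emission is target specific):

  // Inside a target's createRemainingIterationsGreaterCondition():
  Register Counter;
  if (LastStage0Insts.empty()) {
    // Called for the check block in front of the prolog: compare the
    // initial value of the counter against TC.
    Counter = Init;
  } else {
    // Otherwise use the value produced by the expanded copy of the counter
    // update in the last unrolled iteration of the kernel.
    Counter = LastStage0Insts[Update]->getOperand(0).getReg();
  }
  // ...then emit instructions computing "Counter > TC" at the bottom of
  // MBB and describe the resulting branch condition in Cond.

Fields of this kind (Update, Init, Comp) appear in the AArch64
implementation added in the next patch of this series.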

>From 38a765b24f0cda9358d7ff91bda92e8b75822952 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 9 Apr 2024 17:04:20 +0000
Subject: [PATCH 06/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/lib/CodeGen/ModuloSchedule.cpp           |   2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 373 +++++++++++++++---
 .../CodeGen/AArch64/sms-acceptable-loop3.mir  |  80 ----
 .../CodeGen/AArch64/sms-acceptable-loop4.mir  |  80 ----
 llvm/test/CodeGen/AArch64/sms-mve1.mir        | 144 +++++++
 llvm/test/CodeGen/AArch64/sms-mve2.mir        | 129 ++++++
 llvm/test/CodeGen/AArch64/sms-mve3.mir        | 116 ++++++
 llvm/test/CodeGen/AArch64/sms-mve4.mir        | 130 ++++++
 llvm/test/CodeGen/AArch64/sms-mve5.mir        | 140 +++++++
 llvm/test/CodeGen/AArch64/sms-mve6.mir        | 138 +++++++
 llvm/test/CodeGen/AArch64/sms-mve7.mir        | 128 ++++++
 llvm/test/CodeGen/AArch64/sms-mve8.mir        | 138 +++++++
 llvm/test/CodeGen/AArch64/sms-mve9.mir        | 152 +++++++
 .../CodeGen/AArch64/sms-unpipeline-insts2.mir |  81 ----
 14 files changed, 1545 insertions(+), 286 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
 delete mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve1.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve2.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve3.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve4.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve5.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve6.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve7.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve8.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve9.mir
 delete mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b843a7fd2ad19..b6527445e08e9 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2394,7 +2394,7 @@ void ModuloScheduleExpanderMVE::generatePhi(
   // Stage  2+ 1a 0b           Kernel Unroll#0
 
   for (MachineOperand &DefMO : OrigMI->defs()) {
-    if (!DefMO.isReg())
+    if (!DefMO.isReg() || DefMO.isDead())
       continue;
     Register OrigReg = DefMO.getReg();
     auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 22687b0e31c28..5fb868a6ee5f1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -17,6 +17,7 @@
 #include "AArch64PointerAuth.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -9572,18 +9573,49 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
 
 namespace {
 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
-  MachineInstr *PredBranch;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo &MRI;
+
+  /// The block of the loop
+  MachineBasicBlock *LoopBB;
+  /// The conditional branch of the loop
+  MachineInstr *CondBranch;
+  /// The compare instruction for loop control
+  MachineInstr *Comp;
+  /// The operand index of the loop counter value in Comp
+  unsigned CompCounterOprNum;
+  /// The instruction that updates the loop counter value
+  MachineInstr *Update;
+  /// The operand index of the loop counter value in Update
+  unsigned UpdateCounterOprNum;
+  /// The initial value of the loop counter
+  Register Init;
+  /// True iff Update is a predecessor of Comp
+  bool IsUpdatePriorComp;
+
+  /// The normalized condition used by createTripCountGreaterCondition()
   SmallVector<MachineOperand, 4> Cond;
 
 public:
-  AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
+  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
+                           MachineInstr *Comp, unsigned CompCounterOprNum,
+                           MachineInstr *Update, unsigned UpdateCounterOprNum,
+                           Register Init, bool IsUpdatePriorComp,
                            const SmallVectorImpl<MachineOperand> &Cond)
-      : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {}
+      : MF(Comp->getParent()->getParent()),
+        TII(MF->getSubtarget().getInstrInfo()),
+        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
+        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
+        CompCounterOprNum(CompCounterOprNum), Update(Update),
+        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
+        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
 
   bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
     // Make the instructions for loop control be placed in stage 0.
-    // The predecessors of PredBranch are considered by the caller.
-    return MI == PredBranch;
+    // The predecessors of Comp are considered by the caller.
+    return MI == Comp;
   }
 
   std::optional<bool> createTripCountGreaterCondition(
@@ -9596,31 +9628,256 @@ class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     return {};
   }
 
+  void createRemainingIterationsGreaterCondition(
+      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override;
+
   void setPreheader(MachineBasicBlock *NewPreheader) override {}
 
   void adjustTripCount(int TripCountAdjust) override {}
 
   void disposed() override {}
+  bool isMVEExpanderSupported() override { return true; }
 };
 } // namespace
 
-static bool isCompareAndBranch(unsigned Opcode) {
-  switch (Opcode) {
-  case AArch64::CBZW:
-  case AArch64::CBZX:
-  case AArch64::CBNZW:
-  case AArch64::CBNZX:
-  case AArch64::TBZW:
-  case AArch64::TBZX:
-  case AArch64::TBNZW:
-  case AArch64::TBNZX:
-    return true;
+/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
+/// is replaced by ReplaceReg. The output register is newly created.
+/// The other operands are unchanged from MI.
+static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
+                           Register ReplaceReg, MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertTo) {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
+  Register Result = 0;
+  for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
+    if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
+      Result = MRI.createVirtualRegister(
+          MRI.getRegClass(NewMI->getOperand(0).getReg()));
+      NewMI->getOperand(I).setReg(Result);
+    } else if (I == ReplaceOprNum) {
+      MRI.constrainRegClass(
+          ReplaceReg,
+          TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
+      NewMI->getOperand(I).setReg(ReplaceReg);
+    }
   }
-  return false;
+  MBB.insert(InsertTo, NewMI);
+  return Result;
+}
+
+void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
+    int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+    DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) {
+  // Create and accumulate conditions for next TC iterations.
+  // Example:
+  //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
+  //                                          # iteration of the kernel
+  //
+  //   # insert the following instructions
+  //   cond = CSINCXr 0, 0, C, implicit $nzcv
+  //   counter = ADDXri counter, 1            # clone from this->Update
+  //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
+  //   cond = CSINCXr cond, cond, C, implicit $nzcv
+  //   ... (repeat TC times)
+  //   SUBSXri cond, 0, implicit-def $nzcv
+
+  assert(CondBranch->getOpcode() == AArch64::Bcc);
+  // CondCode to exit the loop
+  AArch64CC::CondCode CC =
+      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
+  if (CondBranch->getOperand(1).getMBB() == LoopBB)
+    CC = AArch64CC::getInvertedCondCode(CC);
+
+  // Accumulate conditions to exit the loop
+  Register AccCond = AArch64::XZR;
+
+  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
+  auto AccumulateCond = [&](Register CurCond,
+                            AArch64CC::CondCode CC) -> Register {
+    Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
+    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
+        .addReg(NewCond, RegState::Define)
+        .addReg(CurCond)
+        .addReg(CurCond)
+        .addImm(AArch64CC::getInvertedCondCode(CC));
+    return NewCond;
+  };
+
+  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
+    // Update and Comp for I == 0 already exist in MBB
+    // (MBB is an unrolled kernel)
+    Register Counter;
+    for (int I = 0; I <= TC; ++I) {
+      Register NextCounter;
+      if (I != 0)
+        NextCounter =
+            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
+
+      AccCond = AccumulateCond(AccCond, CC);
+
+      if (I != TC) {
+        if (I == 0) {
+          if (Update != Comp && IsUpdatePriorComp) {
+            Counter =
+                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
+            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
+                                     MBB.end());
+          } else {
+            // We can reuse the already-computed value
+            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
+          }
+        } else if (Update != Comp) {
+          NextCounter =
+              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+        }
+      }
+      Counter = NextCounter;
+    }
+  } else {
+    Register Counter;
+    if (LastStage0Insts.empty()) {
+      // Use the initial counter value (testing whether the trip count is
+      // sufficient for the pipelined code to be executed)
+      Counter = Init;
+      if (IsUpdatePriorComp)
+        Counter =
+            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+    } else {
+      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
+      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
+    }
+
+    for (int I = 0; I <= TC; ++I) {
+      Register NextCounter;
+      NextCounter =
+          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
+      AccCond = AccumulateCond(AccCond, CC);
+      if (I != TC && Update != Comp)
+        NextCounter =
+            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+      Counter = NextCounter;
+    }
+  }
+
+  // If AccCond == 0, the remainder is greater than TC.
+  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
+      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
+      .addReg(AccCond)
+      .addImm(0)
+      .addImm(0);
+  Cond.clear();
+  Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
+}
+
+static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
+                          Register *RegMBB, Register *RegOther) {
+  assert(Phi.getNumOperands() == 5);
+  if (Phi.getOperand(2).getMBB() == MBB) {
+    *RegMBB = Phi.getOperand(1).getReg();
+    *RegOther = Phi.getOperand(3).getReg();
+  } else {
+    assert(Phi.getOperand(4).getMBB() == MBB);
+    *RegMBB = Phi.getOperand(3).getReg();
+    *RegOther = Phi.getOperand(1).getReg();
+  }
+}
+
+static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
+  if (!Reg.isVirtual())
+    return false;
+  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  return MRI.getVRegDef(Reg)->getParent() != BB;
+}
+
+/// If Reg is an induction variable, return true and set the output parameters
+static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
+                          MachineInstr *&UpdateInst,
+                          unsigned *UpdateCounterOprNum, Register *InitReg,
+                          bool *IsUpdatePriorComp) {
+  if (LoopBB->pred_size() != 2)
+    return false;
+  if (!Reg.isVirtual())
+    return false;
+  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+  UpdateInst = nullptr;
+  *UpdateCounterOprNum = 0;
+  *InitReg = 0;
+  *IsUpdatePriorComp = true;
+  Register CurReg = Reg;
+  while (true) {
+    MachineInstr *Def = MRI.getVRegDef(CurReg);
+    if (Def->getParent() != LoopBB)
+      return false;
+    if (Def->isCopy()) {
+      CurReg = Def->getOperand(1).getReg();
+    } else if (Def->isPHI()) {
+      if (*InitReg != 0)
+        return false;
+      if (!UpdateInst)
+        *IsUpdatePriorComp = false;
+      extractPhiReg(*Def, LoopBB, &CurReg, InitReg);
+    } else {
+      if (UpdateInst)
+        return false;
+      switch (Def->getOpcode()) {
+      case AArch64::ADDSXri:
+      case AArch64::ADDSWri:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSWri:
+      case AArch64::ADDXri:
+      case AArch64::ADDWri:
+      case AArch64::SUBXri:
+      case AArch64::SUBWri:
+        UpdateInst = Def;
+        *UpdateCounterOprNum = 1;
+        break;
+      case AArch64::ADDSXrr:
+      case AArch64::ADDSWrr:
+      case AArch64::SUBSXrr:
+      case AArch64::SUBSWrr:
+      case AArch64::ADDXrr:
+      case AArch64::ADDWrr:
+      case AArch64::SUBXrr:
+      case AArch64::SUBWrr:
+        UpdateInst = Def;
+        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
+          *UpdateCounterOprNum = 1;
+        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
+          *UpdateCounterOprNum = 2;
+        else
+          return false;
+        break;
+      default:
+        return false;
+      }
+      CurReg = Def->getOperand(*UpdateCounterOprNum).getReg();
+    }
+
+    if (!CurReg.isVirtual())
+      return false;
+    if (Reg == CurReg)
+      break;
+  }
+
+  if (!UpdateInst)
+    return false;
+
+  return true;
 }
 
 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  // Accept loops that meet the following conditions
+  // * The conditional branch is BCC
+  // * The compare instruction is ADDS/SUBS/WHILEXX
+  // * One operand of the compare is an induction variable and the other is a
+  //   loop invariant value
+  // * The induction variable is incremented/decremented by a single instruction
+
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
   if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
@@ -9631,48 +9888,76 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
     return nullptr;
 
   // Must be conditional branch
-  if (FBB == nullptr)
+  if (TBB != LoopBB && FBB == nullptr)
     return nullptr;
 
   assert((TBB == LoopBB || FBB == LoopBB) &&
          "The Loop must be a single-basic-block loop");
 
+  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  if (CondBranch->getOpcode() != AArch64::Bcc)
+    return nullptr;
+
   // Normalization for createTripCountGreaterCondition()
   if (TBB == LoopBB)
     reverseBranchCondition(Cond);
 
-  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
-  const TargetRegisterInfo &TRI = getRegisterInfo();
-
-  // Find the immediate predecessor of the conditional branch
-  MachineInstr *PredBranch = nullptr;
-  if (CondBranch->getOpcode() == AArch64::Bcc) {
-    for (MachineInstr &MI : reverse(*LoopBB)) {
-      if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
-        PredBranch = &MI;
+  MachineInstr *Comp = nullptr;
+  unsigned CompCounterOprNum = 0;
+  for (MachineInstr &MI : reverse(*LoopBB)) {
+    if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
+      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
+      // operands is a loop invariant value
+
+      switch (MI.getOpcode()) {
+      case AArch64::SUBSXri:
+      case AArch64::SUBSWri:
+      case AArch64::ADDSXri:
+      case AArch64::ADDSWri:
+        Comp = &MI;
+        CompCounterOprNum = 1;
+        break;
+      case AArch64::ADDSWrr:
+      case AArch64::ADDSXrr:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXrr:
+        Comp = &MI;
         break;
+      default:
+        if (isWhileOpcode(MI.getOpcode())) {
+          Comp = &MI;
+          break;
+        }
+        return nullptr;
       }
-    }
-    if (!PredBranch)
-      return nullptr;
-  } else if (isCompareAndBranch(CondBranch->getOpcode())) {
-    const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
-    Register Reg = CondBranch->getOperand(0).getReg();
-    if (!Reg.isVirtual())
-      return nullptr;
-    PredBranch = MRI.getVRegDef(Reg);
 
-    // MachinePipeliner does not expect that the immediate predecessor is a Phi
-    if (PredBranch->isPHI())
-      return nullptr;
+      if (CompCounterOprNum == 0) {
+        if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
+          CompCounterOprNum = 2;
+        else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
+          CompCounterOprNum = 1;
+        else
+          return nullptr;
+      }
+      break;
+    }
+  }
+  if (!Comp)
+    return nullptr;
 
-    if (PredBranch->getParent() != LoopBB)
-      return nullptr;
-  } else {
+  MachineInstr *Update = nullptr;
+  Register Init;
+  bool IsUpdatePriorComp;
+  unsigned UpdateCounterOprNum;
+  if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
+                     Update, &UpdateCounterOprNum, &Init, &IsUpdatePriorComp))
     return nullptr;
-  }
 
-  return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
+  return std::make_unique<AArch64PipelinerLoopInfo>(
+      LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
+      Init, IsUpdatePriorComp, Cond);
 }
 
 #define GET_INSTRINFO_HELPERS
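
As a rough mental model of the sequence emitted by createRemainingIterationsGreaterCondition above, the following scalar sketch (illustrative only, not part of the patch; it assumes an up-counting loop whose exit test is counter >= n) computes the same predicate: the CSINC chain counts how many of the next TC+1 exit tests would hold, and the remaining trip count exceeds TC exactly when that count stays zero, which is what the final SUBSXri/Bcc-EQ pair checks.

  #include <cstdint>

  // Illustrative-only scalar model of the emitted SUBS/CSINC/Bcc sequence,
  // assuming the loop exits once `counter >= n` and advances by `step`.
  static bool remainingIterationsGreaterThan(int64_t counter, int64_t n,
                                             int64_t step, int tc) {
    int64_t acc = 0;                 // models AccCond (starts as XZR)
    for (int i = 0; i <= tc; ++i) {
      if (counter >= n)              // models the cloned Comp plus the exit CC
        ++acc;                       // models CSINCXr acc, acc, <inverted CC>
      counter += step;               // models the cloned Update
    }
    return acc == 0;                 // models SUBSXri acc, 0 followed by Bcc EQ
  }
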
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
deleted file mode 100644
index 94dd299d1caa7..0000000000000
--- a/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
+++ /dev/null
@@ -1,80 +0,0 @@
-# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
-# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Compare and branch
-# CHECK: Schedule Found? 1
-
---- |
-  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
-  entry:
-    %or.cond = icmp ult i32 %n, 2
-    br i1 %or.cond, label %for.end, label %for.body.preheader
-  
-  for.body.preheader:                               ; preds = %entry
-    %i.07 = add i32 %n, -1
-    %0 = sext i32 %i.07 to i64
-    br label %for.body
-  
-  for.body:                                         ; preds = %for.body.preheader, %for.body
-    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-    %1 = shl nsw i64 %indvars.iv, 2
-    %scevgep = getelementptr i8, ptr %b, i64 %1
-    %2 = load float, ptr %scevgep, align 4
-    %add = fadd float %2, 1.000000e+00
-    %3 = shl nsw i64 %indvars.iv, 2
-    %scevgep11 = getelementptr i8, ptr %a, i64 %3
-    store float %add, ptr %scevgep11, align 4
-    %indvars.iv.next = add nsw i64 %indvars.iv, -1
-    %4 = add i64 %indvars.iv, -1
-    %5 = and i64 %4, 4294967295
-    %tobool.not = icmp eq i64 %5, 0
-    br i1 %tobool.not, label %for.end, label %for.body
-  
-  for.end:                                          ; preds = %for.body, %entry
-    ret void
-  }
-  
-...
----
-name:            func
-tracksRegLiveness: true
-liveins:
-  - { reg: '$x0', virtual-reg: '%3' }
-  - { reg: '$x1', virtual-reg: '%4' }
-  - { reg: '$w2', virtual-reg: '%5' }
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $w2
-  
-    %5:gpr32common = COPY $w2
-    %4:gpr64common = COPY $x1
-    %3:gpr64common = COPY $x0
-    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
-    Bcc 3, %bb.3, implicit $nzcv
-    B %bb.1
-  
-  bb.1.for.body.preheader:
-    %7:gpr32common = SUBWri %5, 1, 0
-    %9:gpr64all = IMPLICIT_DEF
-    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
-    %10:gpr64 = SBFMXri killed %8, 0, 31
-    %0:gpr64all = COPY %10
-    %12:fpr32 = FMOVSi 112
-  
-  bb.2.for.body:
-    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-  
-    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
-    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
-    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
-    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
-    %14:gpr64common = SUBXri %1, 1, 0
-    %2:gpr64all = COPY %14
-    %15:gpr32 = COPY %14.sub_32
-    CBZW killed %15, %bb.3
-    B %bb.2
-  
-  bb.3.for.end:
-    RET_ReallyLR
-
-...
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
deleted file mode 100644
index fbd74a777aa1e..0000000000000
--- a/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
+++ /dev/null
@@ -1,80 +0,0 @@
-# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
-# An acceptable loop by pipeliner TBB == LoopBB, FBB == ExitBB, Compare and branch
-# CHECK: Schedule Found? 1
-
---- |
-  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
-  entry:
-    %or.cond = icmp ult i32 %n, 2
-    br i1 %or.cond, label %for.end, label %for.body.preheader
-  
-  for.body.preheader:                               ; preds = %entry
-    %i.07 = add i32 %n, -1
-    %0 = sext i32 %i.07 to i64
-    br label %for.body
-  
-  for.body:                                         ; preds = %for.body.preheader, %for.body
-    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-    %1 = shl nsw i64 %indvars.iv, 2
-    %scevgep = getelementptr i8, ptr %b, i64 %1
-    %2 = load float, ptr %scevgep, align 4
-    %add = fadd float %2, 1.000000e+00
-    %3 = shl nsw i64 %indvars.iv, 2
-    %scevgep11 = getelementptr i8, ptr %a, i64 %3
-    store float %add, ptr %scevgep11, align 4
-    %indvars.iv.next = add nsw i64 %indvars.iv, -1
-    %4 = add i64 %indvars.iv, -1
-    %5 = and i64 %4, 4294967295
-    %tobool.not = icmp eq i64 %5, 0
-    br i1 %tobool.not, label %for.end, label %for.body
-  
-  for.end:                                          ; preds = %for.body, %entry
-    ret void
-  }
-  
-...
----
-name:            func
-tracksRegLiveness: true
-liveins:
-  - { reg: '$x0', virtual-reg: '%3' }
-  - { reg: '$x1', virtual-reg: '%4' }
-  - { reg: '$w2', virtual-reg: '%5' }
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $w2
-  
-    %5:gpr32common = COPY $w2
-    %4:gpr64common = COPY $x1
-    %3:gpr64common = COPY $x0
-    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
-    Bcc 3, %bb.3, implicit $nzcv
-    B %bb.1
-  
-  bb.1.for.body.preheader:
-    %7:gpr32common = SUBWri %5, 1, 0
-    %9:gpr64all = IMPLICIT_DEF
-    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
-    %10:gpr64 = SBFMXri killed %8, 0, 31
-    %0:gpr64all = COPY %10
-    %12:fpr32 = FMOVSi 112
-  
-  bb.2.for.body:
-    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-  
-    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
-    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
-    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
-    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
-    %14:gpr64common = SUBXri %1, 1, 0
-    %2:gpr64all = COPY %14
-    %15:gpr32 = COPY %14.sub_32
-    CBNZW killed %15, %bb.2
-    B %bb.3
-  
-  bb.3.for.end:
-    RET_ReallyLR
-
-...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve1.mir b/llvm/test/CodeGen/AArch64/sms-mve1.mir
new file mode 100644
index 0000000000000..c7f187c807ead
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve1.mir
@@ -0,0 +1,144 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by the MVE algorithm
+# #stages: 2, unroll count: 3
+
+# the calculation result can be checked as follows (driver code written in C):
+# for (i=2; i<N; i++)
+#   func_noswp(i, 1) == func(i, 1)
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr2]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr7:%[0-9]+]], %bb.5, [[ADDXrr4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5:%[0-9]+]]:gpr64 = ADDXrr [[PHI3]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[ADDXrr5]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr6]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr7]]:gpr64 = ADDXrr [[ADDXrr6]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr7]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr8]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr9:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr8]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr9]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr10:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr10]]:gpr64 = ADDXrr [[PHI7]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr10]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
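
The driver check described in the header of sms-mve1.mir could look like the sketch below. This is illustrative only and not part of the patch: func_noswp is assumed to be the same kernel built without the pipeliner, and the double(long, long) signature is an assumption inferred from the MIR interface ($x0 and $x1 in, $d0 out).

  #include <cassert>

  // Hypothetical correctness driver for the check described in the test
  // header. Both symbol names and the signature are assumptions.
  extern "C" double func(long n, long x);        // pipelined (MVE) version
  extern "C" double func_noswp(long n, long x);  // same kernel, not pipelined

  int main() {
    const long N = 64; // arbitrary bound chosen for illustration
    for (long i = 2; i < N; ++i)
      assert(func_noswp(i, 1) == func(i, 1));
    return 0;
  }
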
diff --git a/llvm/test/CodeGen/AArch64/sms-mve2.mir b/llvm/test/CodeGen/AArch64/sms-mve2.mir
new file mode 100644
index 0000000000000..103f374196977
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve2.mir
@@ -0,0 +1,129 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=4 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by the MVE algorithm
+# #stages: 2, unroll count: 2
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr2]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr3:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr4:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr6:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr5:%[0-9]+]], %bb.5, [[ADDXrr3]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr7:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr3]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4]]:gpr64 = ADDXrr [[PHI3]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr3]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr6]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr3]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5]]:gpr64 = ADDXrr [[ADDXrr4]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7]]:fpr64 = FADDDrr [[FADDDrr6]], [[FADDDrr3]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr6:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr5]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr6]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr3]], [[CSINCXr3]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr4]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr8:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr3]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr5]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr6]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr5]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr7:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr9:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr7]]:gpr64 = ADDXrr [[PHI7]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr9]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr10:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr9]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr9]], %bb.1, [[FADDDrr6]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr11]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve3.mir b/llvm/test/CodeGen/AArch64/sms-mve3.mir
new file mode 100644
index 0000000000000..017383b46be0c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve3.mir
@@ -0,0 +1,116 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by the MVE algorithm
+# #stages: 2, unroll count: 1
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr1]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FADDDrr]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr2:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr3:%[0-9]+]], %bb.5, [[ADDXrr2]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr4:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3]]:gpr64 = ADDXrr [[PHI1]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4]]:fpr64 = FADDDrr [[FADDDrr2]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr2]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr5:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[FADDDrr4]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr2]], %bb.6
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr3]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:gpr64 = PHI [[PHI4]], %bb.7, [[ADDXrr4:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:fpr64 = PHI [[PHI3]], %bb.7, [[FADDDrr6:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr4]]:gpr64 = ADDXrr [[PHI5]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr6]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI6]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr6]], [[FADDDrr6]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr7]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:fpr64 = PHI [[FADDDrr6]], %bb.1, [[FADDDrr2]], %bb.6
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[FADDDrr8]], %bb.1, [[FADDDrr5]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI7]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %21, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %22, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve4.mir b/llvm/test/CodeGen/AArch64/sms-mve4.mir
new file mode 100644
index 0000000000000..203ce6ddfd2ec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve4.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by the MVE algorithm
+# no dedicated exit
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   dead [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr [[COPY]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr1]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FADDDrr]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr2:%[0-9]+]], %bb.6, [[FADDDrr]], %bb.5
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr3:%[0-9]+]], %bb.6, [[ADDXrr2]], %bb.5
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr4:%[0-9]+]], %bb.6, [[FADDDrr1]], %bb.5
+  ; CHECK-NEXT:   [[FADDDrr2]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3]]:gpr64 = ADDXrr [[PHI1]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4]]:fpr64 = FADDDrr [[FADDDrr2]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr2]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.6, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr5:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[FADDDrr4]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.8, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.4, [[FADDDrr2]], %bb.7
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.4, [[ADDXrr3]], %bb.7
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.2, [[FADDDrr5]], %bb.7
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[PHI4]], %bb.8, [[ADDXrr4:%[0-9]+]], %bb.2
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:fpr64 = PHI [[PHI3]], %bb.8, [[FADDDrr6:%[0-9]+]], %bb.2
+  ; CHECK-NEXT:   [[ADDXrr4]]:gpr64 = ADDXrr [[PHI6]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr6]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI7]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr6]], [[FADDDrr6]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr7]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.2, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.0, [[PHI5]], %bb.9
+  ; CHECK-NEXT:   $d0 = COPY [[PHI8]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+    dead %15:gpr64 = SUBSXrr %10, %11, implicit-def $nzcv
+    Bcc 0, %bb.3, implicit $nzcv
+
+  bb.1:
+
+  bb.2:
+    %12:gpr64 = PHI %11, %bb.1, %13, %bb.2
+    %24:fpr64 = PHI %20, %bb.1, %21, %bb.2
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %21, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %22, implicit $fpcr
+    Bcc 1, %bb.2, implicit $nzcv
+    B %bb.3
+
+  bb.3:
+    %25:fpr64 = PHI %20, %bb.0, %23, %bb.2
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve5.mir b/llvm/test/CodeGen/AArch64/sms-mve5.mir
new file mode 100644
index 0000000000000..4795df70c07a4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve5.mir
@@ -0,0 +1,140 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by the MVE algorithm
+# exit loop when condition holds
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr2]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr7:%[0-9]+]], %bb.5, [[ADDXrr4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5:%[0-9]+]]:gpr64 = ADDXrr [[PHI3]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[ADDXrr5]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr6]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr7]]:gpr64 = ADDXrr [[ADDXrr6]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr7]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr8]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr9:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr8]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr9]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr10:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr10]]:gpr64 = ADDXrr [[PHI7]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr10]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 0, %bb.2, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve6.mir b/llvm/test/CodeGen/AArch64/sms-mve6.mir
new file mode 100644
index 0000000000000..527e9e9d09dfc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve6.mir
@@ -0,0 +1,138 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# #stages: 2, unroll count: 3
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr2]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr5:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[ADDXrr3]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[PHI3]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[PHI3]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr4]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5]]:gpr64 = ADDXrr [[ADDXrr4]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[ADDXrr5]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr6]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr7:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr6]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[ADDXrr5]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr6]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr8:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY]], [[PHI7]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8]]:gpr64 = ADDXrr [[PHI7]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    dead $xzr = SUBSXrr %10, %12, implicit-def $nzcv
+    %13:gpr64 = ADDXrr %12, %11
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve7.mir b/llvm/test/CodeGen/AArch64/sms-mve7.mir
new file mode 100644
index 0000000000000..3b3d0ea09eaf0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve7.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# compare instruction also updates the counter
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr [[COPY]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[SUBSXrr1:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[SUBSXrr2:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[SUBSXrr3:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr2]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[SUBSXrr4:%[0-9]+]]:gpr64 = SUBSXrr [[COPY]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[SUBSXrr6:%[0-9]+]], %bb.5, [[COPY]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[SUBSXrr7:%[0-9]+]], %bb.5, [[SUBSXrr4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[SUBSXrr5:%[0-9]+]]:gpr64 = SUBSXrr [[PHI3]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[SUBSXrr6]]:gpr64 = SUBSXrr [[SUBSXrr5]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[SUBSXrr7]]:gpr64 = SUBSXrr [[SUBSXrr6]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[SUBSXrr8:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr7]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[SUBSXrr9:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr8]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[SUBSXrr10:%[0-9]+]]:gpr64 = SUBSXrr [[SUBSXrr6]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY]], %bb.3, [[SUBSXrr7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[SUBSXrr11:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[SUBSXrr11]]:gpr64 = SUBSXrr [[PHI7]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %10, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = SUBSXrr %12, %11, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve8.mir b/llvm/test/CodeGen/AArch64/sms-mve8.mir
new file mode 100644
index 0000000000000..c1ea6defac1fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve8.mir
@@ -0,0 +1,138 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# compare instruction also updates the counter
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr2]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr5:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[ADDXrr3]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[PHI3]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[PHI3]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr4]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5]]:gpr64 = ADDXrr [[ADDXrr4]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr5]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[ADDXrr5]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr6]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr7:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr6]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr7]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr5]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr6]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr8:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[PHI7]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8]]:gpr64 = ADDXrr [[PHI7]], [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    dead $xzr = SUBSXrr %12, %11, implicit-def $nzcv
+    %13:gpr64 = ADDXrr %12, %11
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve9.mir b/llvm/test/CodeGen/AArch64/sms-mve9.mir
new file mode 100644
index 0000000000000..469a9ecfff8ee
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve9.mir
@@ -0,0 +1,152 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# COPY instructions exist
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr1]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr2]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY2]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr64sp = COPY [[ADDXrr3]]
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64sp = PHI [[COPY7:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:gpr64 = PHI [[COPY8:%[0-9]+]], %bb.5, [[COPY2]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[ADDXrr3]], %bb.4
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:gpr64sp = PHI [[COPY9:%[0-9]+]], %bb.5, [[COPY3]], %bb.4
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr64 = COPY [[PHI5]]
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI3]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI6]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[COPY4]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY4]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr64sp = COPY [[ADDXrr4]]
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI3]], implicit $fpcr
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gpr64 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI3]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5:%[0-9]+]]:gpr64 = ADDXrr [[COPY6]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY6]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY7]]:gpr64sp = COPY [[ADDXrr5]]
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[COPY8]]:gpr64 = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[COPY8]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY8]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY9]]:gpr64sp = COPY [[ADDXrr6]]
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr6]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr7:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr6]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr7]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY8]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:gpr64common = PHI [[COPY1]], %bb.3, [[COPY9]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:gpr64sp = PHI [[PHI8]], %bb.7, [[COPY11:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[PHI7]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gpr64 = COPY [[PHI9]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[COPY10]], [[COPY1]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8:%[0-9]+]]:gpr64 = ADDXrr [[COPY10]], [[COPY1]]
+  ; CHECK-NEXT:   [[COPY11]]:gpr64sp = COPY [[ADDXrr8]]
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI12:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI11]], [[PHI12]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64sp = PHI %11, %bb.0, %15, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %14:gpr64 = COPY %12
+    dead $xzr = SUBSXrr %14, %11, implicit-def $nzcv
+    %13:gpr64 = ADDXrr %14, %11
+    %15:gpr64sp = COPY %13
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
deleted file mode 100644
index c5b76d88ff00d..0000000000000
--- a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
+++ /dev/null
@@ -1,81 +0,0 @@
-# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
-# An acceptable loop by pipeliner TBB == LoopBB, FBB == ExitBB, Compare and branch
-# CHECK: SU([[SU0:[0-9]+]]):   [[V0:%[0-9]+]]:gpr64common = SUBXri [[V1:%[0-9]+]]:gpr64common, 1, 0
-# CHECK: Do not pipeline SU([[SU0:[0-9]+]])
-
---- |
-  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
-  entry:
-    %or.cond = icmp ult i32 %n, 2
-    br i1 %or.cond, label %for.end, label %for.body.preheader
-  
-  for.body.preheader:                               ; preds = %entry
-    %i.07 = add i32 %n, -1
-    %0 = sext i32 %i.07 to i64
-    br label %for.body
-  
-  for.body:                                         ; preds = %for.body.preheader, %for.body
-    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-    %1 = shl nsw i64 %indvars.iv, 2
-    %scevgep = getelementptr i8, ptr %b, i64 %1
-    %2 = load float, ptr %scevgep, align 4
-    %add = fadd float %2, 1.000000e+00
-    %3 = shl nsw i64 %indvars.iv, 2
-    %scevgep11 = getelementptr i8, ptr %a, i64 %3
-    store float %add, ptr %scevgep11, align 4
-    %indvars.iv.next = add nsw i64 %indvars.iv, -1
-    %4 = add i64 %indvars.iv, -1
-    %5 = and i64 %4, 4294967295
-    %tobool.not = icmp eq i64 %5, 0
-    br i1 %tobool.not, label %for.end, label %for.body
-  
-  for.end:                                          ; preds = %for.body, %entry
-    ret void
-  }
-  
-...
----
-name:            func
-tracksRegLiveness: true
-liveins:
-  - { reg: '$x0', virtual-reg: '%3' }
-  - { reg: '$x1', virtual-reg: '%4' }
-  - { reg: '$w2', virtual-reg: '%5' }
-body:             |
-  bb.0.entry:
-    liveins: $x0, $x1, $w2
-  
-    %5:gpr32common = COPY $w2
-    %4:gpr64common = COPY $x1
-    %3:gpr64common = COPY $x0
-    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
-    Bcc 3, %bb.3, implicit $nzcv
-    B %bb.1
-  
-  bb.1.for.body.preheader:
-    %7:gpr32common = SUBWri %5, 1, 0
-    %9:gpr64all = IMPLICIT_DEF
-    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
-    %10:gpr64 = SBFMXri killed %8, 0, 31
-    %0:gpr64all = COPY %10
-    %12:fpr32 = FMOVSi 112
-  
-  bb.2.for.body:
-    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-  
-    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
-    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
-    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
-    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
-    %14:gpr64common = SUBXri %1, 1, 0
-    %2:gpr64all = COPY %14
-    %15:gpr32 = COPY %14.sub_32
-    CBNZW killed %15, %bb.2
-    B %bb.3
-  
-  bb.3.for.end:
-    RET_ReallyLR
-
-...

>From 1ec55c8bbc9b7433dbe76d8aa03615754f01791e Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 12 Apr 2024 13:19:11 +0000
Subject: [PATCH 07/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/test/CodeGen/AArch64/sms-mve10.mir       | 140 +++++++++++++++++
 llvm/test/CodeGen/AArch64/sms-mve11.mir       | 140 +++++++++++++++++
 llvm/test/CodeGen/AArch64/sms-mve12.mir       | 142 ++++++++++++++++++
 .../AArch64/sms-unacceptable-loop3.mir        | 109 ++++++++++++++
 4 files changed, 531 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve10.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve11.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-mve12.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-unacceptable-loop3.mir

diff --git a/llvm/test/CodeGen/AArch64/sms-mve10.mir b/llvm/test/CodeGen/AArch64/sms-mve10.mir
new file mode 100644
index 0000000000000..f94ad7c95bc14
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve10.mir
@@ -0,0 +1,140 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# same as sms-mve1.mir except for the order of the operands
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr1]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr2]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr2]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr3]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[COPY1]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr4]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64 = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64 = PHI [[ADDXrr7:%[0-9]+]], %bb.5, [[ADDXrr4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[PHI3]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr5]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr5]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr6]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr7]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr6]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr7]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr7]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr8]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr9:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[ADDXrr8]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr9]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr7]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64 = PHI [[COPY1]], %bb.3, [[ADDXrr7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64 = PHI [[PHI6]], %bb.7, [[ADDXrr10:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr10]]:gpr64 = ADDXrr [[COPY1]], [[PHI7]]
+  ; CHECK-NEXT:   dead $xzr = SUBSXrr [[ADDXrr10]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64 = ADDXrr %11, %12
+    dead $xzr = SUBSXrr %13, %10, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve11.mir b/llvm/test/CodeGen/AArch64/sms-mve11.mir
new file mode 100644
index 0000000000000..ab8cff858c2e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve11.mir
@@ -0,0 +1,140 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-n1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# counter increment/compare instruction with immediate operand
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[COPY1]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXri1:%[0-9]+]]:gpr64common = ADDXri [[ADDXri]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri1]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXri2:%[0-9]+]]:gpr64common = ADDXri [[ADDXri1]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri2]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXri3:%[0-9]+]]:gpr64common = ADDXri [[ADDXri2]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri3]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXri4:%[0-9]+]]:gpr64common = ADDXri [[COPY1]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri4]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64common = PHI [[ADDXri6:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64common = PHI [[ADDXri7:%[0-9]+]], %bb.5, [[ADDXri4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXri5:%[0-9]+]]:gpr64common = ADDXri [[PHI3]], 1, 0
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri5]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXri6]]:gpr64common = ADDXri [[ADDXri5]], 1, 0
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri6]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXri7]]:gpr64common = ADDXri [[ADDXri6]], 1, 0
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri7]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXri8:%[0-9]+]]:gpr64common = ADDXri [[ADDXri7]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri8]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 1, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXri9:%[0-9]+]]:gpr64common = ADDXri [[ADDXri8]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri9]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri7]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 1, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64common = PHI [[COPY1]], %bb.3, [[ADDXri7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64common = PHI [[PHI6]], %bb.7, [[ADDXri10:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXri10]]:gpr64common = ADDXri [[PHI7]], 1, 0
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[ADDXri10]], 20, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64common = COPY $x0
+    %11:gpr64common = COPY $x1
+    %20:fpr64 = FMOVDi 1
+
+  bb.1:
+    %12:gpr64common = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64common = ADDXri %12, 1, 0
+    dead $xzr = SUBSXri %13, 20, 0, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-mve12.mir b/llvm/test/CodeGen/AArch64/sms-mve12.mir
new file mode 100644
index 0000000000000..6fce7b5fb9a8e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-mve12.mir
@@ -0,0 +1,142 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg -pipeliner-force-ii=3 -mcpu=neoverse-v1 2>&1 | FileCheck %s
+
+# test pipeliner code generation by MVE algorithm
+# the compare instruction is whilexx
+
+...
+---
+name:            func
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+  ; CHECK-NEXT:   [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 1
+  ; CHECK-NEXT:   [[CNTD_XPiI:%[0-9]+]]:gpr64common = CNTD_XPiI 31, 1, implicit $vg
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 4, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr1:%[0-9]+]]:gpr64common = ADDXrr [[ADDXrr]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D1:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr1]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr1:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr]], [[CSINCXr]], 4, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr2:%[0-9]+]]:gpr64common = ADDXrr [[ADDXrr1]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D2:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr2]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr2:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr1]], [[CSINCXr1]], 4, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr3:%[0-9]+]]:gpr64common = ADDXrr [[ADDXrr2]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D3:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr3]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr3:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr2]], [[CSINCXr2]], 4, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr3]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.4, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr4:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D4:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr4]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr1:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr]], [[FMOVDi]], implicit $fpcr
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:fpr64 = PHI [[FADDDrr5:%[0-9]+]], %bb.5, [[FMOVDi]], %bb.4
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64common = PHI [[ADDXrr6:%[0-9]+]], %bb.5, [[COPY1]], %bb.4
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:fpr64 = PHI [[FADDDrr8:%[0-9]+]], %bb.5, [[FADDDrr]], %bb.4
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:gpr64common = PHI [[ADDXrr7:%[0-9]+]], %bb.5, [[ADDXrr4]], %bb.4
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr10:%[0-9]+]], %bb.5, [[FADDDrr1]], %bb.4
+  ; CHECK-NEXT:   [[FADDDrr2:%[0-9]+]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr5:%[0-9]+]]:gpr64common = ADDXrr [[PHI3]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   [[FADDDrr3:%[0-9]+]]:fpr64 = FADDDrr [[PHI4]], [[PHI]], implicit $fpcr
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D5:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr5]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr4:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr2]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr5]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr6]]:gpr64common = ADDXrr [[ADDXrr5]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   [[FADDDrr6:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr4]], [[PHI2]], implicit $fpcr
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D6:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr6]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr7:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr5]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr8]]:fpr64 = FADDDrr [[FMOVDi]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[ADDXrr7]]:gpr64common = ADDXrr [[ADDXrr6]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   [[FADDDrr9:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr7]], [[FADDDrr2]], implicit $fpcr
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D7:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr7]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr10]]:fpr64 = FADDDrr [[FADDDrr8]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   [[CSINCXr4:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 4, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr8:%[0-9]+]]:gpr64common = ADDXrr [[ADDXrr7]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D8:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr8]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr5:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr4]], [[CSINCXr4]], 4, implicit $nzcv
+  ; CHECK-NEXT:   [[ADDXrr9:%[0-9]+]]:gpr64common = ADDXrr [[ADDXrr8]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D9:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr9]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr6:%[0-9]+]]:gpr64common = CSINCXr [[CSINCXr5]], [[CSINCXr5]], 4, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr6]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.5, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[FADDDrr11:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr10]], [[FADDDrr5]], implicit $fpcr
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D10:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr7]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[CSINCXr7:%[0-9]+]]:gpr64common = CSINCXr $xzr, $xzr, 4, implicit $nzcv
+  ; CHECK-NEXT:   dead $xzr = SUBSXri [[CSINCXr7]], 0, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.3, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:gpr64common = PHI [[COPY1]], %bb.3, [[ADDXrr7]], %bb.6
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:gpr64common = PHI [[PHI6]], %bb.7, [[ADDXrr10:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:fpr64 = PHI [[PHI5]], %bb.7, [[FADDDrr12:%[0-9]+]], %bb.1
+  ; CHECK-NEXT:   [[ADDXrr10]]:gpr64common = ADDXrr [[PHI7]], [[CNTD_XPiI]]
+  ; CHECK-NEXT:   dead [[WHILELO_PXX_D11:%[0-9]+]]:ppr = WHILELO_PXX_D [[ADDXrr10]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   [[FADDDrr12]]:fpr64 = FADDDrr [[FMOVDi]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr13:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr12]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   [[FADDDrr14:%[0-9]+]]:fpr64 = FADDDrr [[FADDDrr13]], [[PHI8]], implicit $fpcr
+  ; CHECK-NEXT:   Bcc 4, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr12]], %bb.1, [[FADDDrr8]], %bb.6
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:fpr64 = PHI [[FADDDrr14]], %bb.1, [[FADDDrr11]], %bb.6
+  ; CHECK-NEXT:   [[FMULDrr:%[0-9]+]]:fpr64 = FMULDrr [[PHI9]], [[PHI10]], implicit $fpcr
+  ; CHECK-NEXT:   $d0 = COPY [[FMULDrr]]
+  ; CHECK-NEXT:   RET_ReallyLR implicit $d0
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64common = COPY $x0
+    %11:gpr64common = COPY $x1
+    %20:fpr64 = FMOVDi 1
+    %26:gpr64common = CNTD_XPiI 31, 1, implicit $vg
+
+  bb.1:
+    %12:gpr64common = PHI %11, %bb.0, %13, %bb.1
+    %24:fpr64 = PHI %20, %bb.0, %21, %bb.1
+    %13:gpr64common = ADDXrr %12, %26
+    dead %30:ppr = WHILELO_PXX_D %13, %10, implicit-def $nzcv
+    %21:fpr64 = FADDDrr %20, %24, implicit $fpcr
+    %22:fpr64 = FADDDrr %21, %24, implicit $fpcr
+    %23:fpr64 = FADDDrr %22, %24, implicit $fpcr
+    Bcc 4, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %25:fpr64 = FMULDrr %21, %23, implicit $fpcr
+    $d0 = COPY %25
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unacceptable-loop3.mir b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop3.mir
new file mode 100644
index 0000000000000..e6d86859a41b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop3.mir
@@ -0,0 +1,109 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# loops that the pipeliner is unable to analyze
+
+...
+---
+name:            func1
+tracksRegLiveness: true
+body:             |
+  ; multiple counter increment instructions
+  ; CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    %14:gpr64 = ADDXrr %13, %11
+    dead $xzr = SUBSXrr %10, %14, implicit-def $nzcv
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+...
+---
+name:            func2
+tracksRegLiveness: true
+body:             |
+  ; neither operand of the increment instruction is a loop invariant value
+  ; CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %13:gpr64 = ADDXrr %12, %12
+    dead $xzr = SUBSXrr %10, %13, implicit-def $nzcv
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+...
+---
+name:            func3
+tracksRegLiveness: true
+body:             |
+  ; neither operand of the compare instruction is a loop invariant value
+  ; CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %13:gpr64 = ADDXrr %12, %11
+    dead $xzr = SUBSXrr %13, %13, implicit-def $nzcv
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+...
+---
+name:            func4
+tracksRegLiveness: true
+body:             |
+  ; multiple phi instructions
+  ; CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %14:gpr64 = PHI %11, %bb.0, %15, %bb.1
+    dead $xzr = SUBSXrr %12, %10, implicit-def $nzcv
+    %13:gpr64 = ADDXrr %14, %11
+    %15:gpr64 = ADDXrr %12, %11
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+...
+---
+name:            func5
+tracksRegLiveness: true
+body:             |
+  ; not an increment instruction
+  ; CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+  bb.0.entry:
+    liveins: $x0, $x1
+    %10:gpr64 = COPY $x0
+    %11:gpr64 = COPY $x1
+
+  bb.1:
+    %12:gpr64 = PHI %11, %bb.0, %13, %bb.1
+    %13:gpr64 = ORRXrr %12, %12
+    dead $xzr = SUBSXrr %12, %10, implicit-def $nzcv
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+...
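
For contrast with the rejected shapes above, the loop form the analysis does accept is the one exercised by the MVE tests earlier in this patch: a single counter PHI, one counter update whose other operand is loop-invariant, and a compare with a loop-invariant operand feeding the conditional branch. A minimal sketch (illustrative only, not part of the patch; register numbers are arbitrary, with %10 and %11 loop-invariant as in the tests above):

    bb.1:
      %12:gpr64 = PHI %11, %bb.0, %13, %bb.1            ; single counter PHI
      %13:gpr64 = ADDXrr %12, %11                       ; one update, %11 is loop-invariant
      dead $xzr = SUBSXrr %13, %10, implicit-def $nzcv  ; compare against loop-invariant %10
      Bcc 1, %bb.1, implicit $nzcv
      B %bb.2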

>From 8c80fe98751b362cd7f4929184cc5ba580e08dc8 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 7 May 2024 15:07:23 +0000
Subject: [PATCH 08/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/include/llvm/CodeGen/ModuloSchedule.h   |  5 ++
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 10 ++--
 llvm/lib/CodeGen/ModuloSchedule.cpp          | 55 +++++++++++++-------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp |  4 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp     |  9 ----
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp |  9 ----
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp     |  9 ----
 7 files changed, 49 insertions(+), 52 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 0e6fd2dabc64f..e9f0f089adfef 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -427,6 +427,11 @@ class ModuloScheduleExpanderMVE {
                       SmallVectorImpl<ValueMapTy> &CurVRMap,
                       SmallVectorImpl<ValueMapTy> *PrevVRMap);
 
+  void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC,
+                        InstrMapTy &LastStage0Insts,
+                        MachineBasicBlock &GreaterThan,
+                        MachineBasicBlock &Otherwise);
+
 public:
   ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
                             LiveIntervals &LIS)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index e978fb7bfbf31..b8484433bf8f4 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -768,7 +768,7 @@ class TargetInstrInfo : public MCInstrInfo {
 
     /// Create a condition to determine if the remaining trip count for a phase
     /// is greater than TC. Some instructions such as comparisons may be
-    /// inserted at the bottom of MBB. The all instructions expanded for the
+    /// inserted at the bottom of MBB. All instructions expanded for the
     /// phase must be inserted in MBB before calling this function.
     /// LastStage0Insts is the map from the original instructions scheduled at
     /// stage#0 to the expanded instructions for the last iteration of the
@@ -780,7 +780,11 @@ class TargetInstrInfo : public MCInstrInfo {
     /// value of the trip count.
     virtual void createRemainingIterationsGreaterCondition(
         int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-        DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) = 0;
+        DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
+      llvm_unreachable(
+          "Target didn't implement "
+          "PipelinerLoopInfo::createRemainingIterationsGreaterCondition!");
+    }
 
     /// Modify the loop such that the trip count is
     /// OriginalTC + TripCountAdjust.
@@ -798,7 +802,7 @@ class TargetInstrInfo : public MCInstrInfo {
 
     /// Return true if the target can expand pipelined schedule with modulo
     /// variable expansion.
-    virtual bool isMVEExpanderSupported() = 0;
+    virtual bool isMVEExpanderSupported() { return false; }
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b6527445e08e9..a0d551e7bd962 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -22,6 +22,10 @@
 #define DEBUG_TYPE "pipeliner"
 using namespace llvm;
 
+static cl::opt<bool> SwapBranchTargetsMVE(
+    "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
+    cl::desc("Swap target blocks of a conditional branch for MVE expander"));
+
 void ModuloSchedule::print(raw_ostream &OS) {
   for (MachineInstr *MI : ScheduledInstrs)
     OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
@@ -2131,20 +2135,38 @@ static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
     llvm_unreachable("unexpected loop structure");
   TII->removeBranch(*Loop);
   TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
-  Loop->removeSuccessor(Exit);
-  Loop->addSuccessor(NewExit);
+  Loop->replaceSuccessor(Exit, NewExit);
   TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
   NewExit->addSuccessor(Exit);
 
-  for (MachineInstr &Phi : Exit->phis()) {
-    for (MachineOperand &MO : Phi.operands())
-      if (MO.isMBB() && MO.getMBB() == Loop)
-        MO.setMBB(NewExit);
-  }
+  Exit->replacePhiUsesWith(Loop, NewExit);
 
   return NewExit;
 }
 
+/// Insert branch code into the end of MBB. It branches to GreaterThan if the
+/// remaining trip count for instructions in LastStage0Insts is greater than
+/// RequiredTC, and to Otherwise otherwise.
+void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
+                                                 int RequiredTC,
+                                                 InstrMapTy &LastStage0Insts,
+                                                 MachineBasicBlock &GreaterThan,
+                                                 MachineBasicBlock &Otherwise) {
+  SmallVector<MachineOperand, 4> Cond;
+  LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
+                                                      LastStage0Insts);
+
+  if (SwapBranchTargetsMVE) {
+    // Set SwapBranchTargetsMVE to true if a target prefers to swap TBB and
+    // FBB for better performance.
+    if (TII->reverseBranchCondition(Cond))
+      llvm_unreachable("can not reverse branch condition");
+    TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
+  } else {
+    TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
+  }
+}
+
 /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
 /// other necessary blocks. The control flow is modified to execute the
 /// pipelined loop if the trip count satisfies the condition, otherwise the
@@ -2261,15 +2283,13 @@ void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
   Epilog->addSuccessor(NewPreheader);
   Epilog->addSuccessor(NewExit);
 
-  SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createRemainingIterationsGreaterCondition(
-      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, InstrMapTy());
-  TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
+  InstrMapTy LastStage0Insts;
+  insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
+                   LastStage0Insts, *Prolog, *NewPreheader);
 
   // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
   // register#
   SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
-  InstrMapTy LastStage0Insts;
   generateProlog(PrologVRMap);
   generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
   generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
@@ -2554,10 +2574,8 @@ void ModuloScheduleExpanderMVE::generateKernel(
   }
 
   // If remaining trip count is greater than NumUnroll-1, loop continues
-  SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createRemainingIterationsGreaterCondition(NumUnroll - 1, *NewKernel,
-                                                      Cond, LastStage0Insts);
-  TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
+  insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
+                   *Epilog);
 
   LLVM_DEBUG({
     dbgs() << "kernel:\n";
@@ -2597,10 +2615,7 @@ void ModuloScheduleExpanderMVE::generateEpilog(
   // Instructions related to loop control, such as loop counter comparison,
   // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
   // in stage 0. Thus, the map is for the last one in the kernel.
-  SmallVector<MachineOperand, 4> Cond;
-  LoopInfo->createRemainingIterationsGreaterCondition(0, *Epilog, Cond,
-                                                      LastStage0Insts);
-  TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
+  insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
 
   LLVM_DEBUG({
     dbgs() << "epilog:\n";
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5fb868a6ee5f1..f514dcf054301 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9630,7 +9630,7 @@ class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
 
   void createRemainingIterationsGreaterCondition(
       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override;
+      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
 
   void setPreheader(MachineBasicBlock *NewPreheader) override {}
 
@@ -9671,7 +9671,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
 
 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
     int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-    DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) {
+    DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
   // Create and accumulate conditions for next TC iterations.
   // Example:
   //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index a8e590d59298f..5d0468948dfb6 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6823,20 +6823,11 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
       llvm_unreachable("Unknown EndLoop");
   }
 
-  void createRemainingIterationsGreaterCondition(
-      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
-    llvm_unreachable(
-        "Target didn't implement createRemainingIterationsGreaterCondition");
-  }
-
   void setPreheader(MachineBasicBlock *NewPreheader) override {}
 
   void adjustTripCount(int TripCountAdjust) override {}
 
   void disposed() override {}
-
-  bool isMVEExpanderSupported() override { return false; }
 };
 
 void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index c65d4099373f4..b9bf26ba7cca1 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -770,13 +770,6 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     return TripCount > TC;
   }
 
-  void createRemainingIterationsGreaterCondition(
-      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
-    llvm_unreachable(
-        "Target didn't implement createRemainingIterationsGreaterCondition");
-  }
-
   void setPreheader(MachineBasicBlock *NewPreheader) override {
     NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(),
                          Loop);
@@ -805,8 +798,6 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
   }
 
   void disposed() override { Loop->eraseFromParent(); }
-
-  bool isMVEExpanderSupported() override { return false; }
 };
 } // namespace
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index bb1e298d9b2f7..5f5eb31a5a85f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -5439,13 +5439,6 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     return TripCount > TC;
   }
 
-  void createRemainingIterationsGreaterCondition(
-      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
-      DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
-    llvm_unreachable(
-        "Target didn't implement createRemainingIterationsGreaterCondition");
-  }
-
   void setPreheader(MachineBasicBlock *NewPreheader) override {
     // Do nothing. We want the LOOP setup instruction to stay in the *old*
     // preheader, so we can use BDZ in the prologs to adapt the loop trip count.
@@ -5470,8 +5463,6 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
     // Ensure the loop setup instruction is deleted too.
     LoopCount->eraseFromParent();
   }
-
-  bool isMVEExpanderSupported() override { return false; }
 };
 } // namespace
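
A note on the createRemainingIterationsGreaterCondition hook that this patch turns into an overridable default: for AArch64, the target that implements it in this series, the emitted condition can be read directly from the CHECK lines of the MVE tests above. The implementation clones one compare per remaining iteration it needs to test, folds each result into a chain of CSINC instructions, and ends with a SUBS of the accumulator against zero so that the branch condition returned in Cond simply tests NZCV. A condensed sketch of that sequence (register numbers are illustrative, mirroring the sms-mve12.mir expectations):

      dead %40:ppr = WHILELO_PXX_D %13, %10, implicit-def $nzcv   ; compare for one more remaining iteration
      %41:gpr64common = CSINCXr $xzr, $xzr, 4, implicit $nzcv     ; fold the result into an accumulator
      dead %42:ppr = WHILELO_PXX_D %14, %10, implicit-def $nzcv   ; compare for the next remaining iteration
      %43:gpr64common = CSINCXr %41, %41, 4, implicit $nzcv
      dead $xzr = SUBSXri %43, 0, 0, implicit-def $nzcv           ; accumulator == 0 means more than TC iterations remain

Accumulating everything into one final flag-setting compare presumably lets insertCondBranch consume the result as an ordinary Cond/Bcc pair via insertBranch.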
 

>From 594e3deb4c4501cfac6ebde266474f92226976c0 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 7 May 2024 15:33:00 +0000
Subject: [PATCH 09/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/lib/CodeGen/ModuloSchedule.cpp          | 2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index a0d551e7bd962..93b29e4c3e1f4 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2170,7 +2170,7 @@ void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
 /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
 /// other necessary blocks. The control flow is modified to execute the
 /// pipelined loop if the trip count satisfies the condition, otherwise the
-/// original loop. The original loop is also used to execute the reminder
+/// original loop. The original loop is also used to execute the remainder
 /// iterations which occur due to unrolling.
 void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
   // The control flow for pipelining with MVE:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f514dcf054301..15d65ac530c59 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9763,7 +9763,7 @@ void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
     }
   }
 
-  // If AccCond == 0, the reminder is greater than TC.
+  // If AccCond == 0, the remainder is greater than TC.
   BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
       .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
       .addReg(AccCond)

>From 004574d8fbe851ed10f651ff7e99919f40b0ff95 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Thu, 6 Jun 2024 12:14:01 +0000
Subject: [PATCH 10/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 36 ++++----
 .../CodeGen/AArch64/sms-acceptable-loop3.mir  | 83 +++++++++++++++++++
 .../CodeGen/AArch64/sms-acceptable-loop4.mir  | 83 +++++++++++++++++++
 3 files changed, 184 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 15d65ac530c59..f444167a45152 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9774,15 +9774,15 @@ void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
 }
 
 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
-                          Register *RegMBB, Register *RegOther) {
+                          Register &RegMBB, Register &RegOther) {
   assert(Phi.getNumOperands() == 5);
   if (Phi.getOperand(2).getMBB() == MBB) {
-    *RegMBB = Phi.getOperand(1).getReg();
-    *RegOther = Phi.getOperand(3).getReg();
+    RegMBB = Phi.getOperand(1).getReg();
+    RegOther = Phi.getOperand(3).getReg();
   } else {
     assert(Phi.getOperand(4).getMBB() == MBB);
-    *RegMBB = Phi.getOperand(3).getReg();
-    *RegOther = Phi.getOperand(1).getReg();
+    RegMBB = Phi.getOperand(3).getReg();
+    RegOther = Phi.getOperand(1).getReg();
   }
 }
 
@@ -9796,17 +9796,17 @@ static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
 /// If Reg is an induction variable, return true and set some parameters
 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                           MachineInstr *&UpdateInst,
-                          unsigned *UpdateCounterOprNum, Register *InitReg,
-                          bool *IsUpdatePriorComp) {
+                          unsigned &UpdateCounterOprNum, Register &InitReg,
+                          bool &IsUpdatePriorComp) {
   if (LoopBB->pred_size() != 2)
     return false;
   if (!Reg.isVirtual())
     return false;
   const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
   UpdateInst = nullptr;
-  *UpdateCounterOprNum = 0;
-  *InitReg = 0;
-  *IsUpdatePriorComp = true;
+  UpdateCounterOprNum = 0;
+  InitReg = 0;
+  IsUpdatePriorComp = true;
   Register CurReg = Reg;
   while (true) {
     MachineInstr *Def = MRI.getVRegDef(CurReg);
@@ -9815,11 +9815,11 @@ static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
     if (Def->isCopy()) {
       CurReg = Def->getOperand(1).getReg();
     } else if (Def->isPHI()) {
-      if (*InitReg != 0)
+      if (InitReg != 0)
         return false;
       if (!UpdateInst)
-        *IsUpdatePriorComp = false;
-      extractPhiReg(*Def, LoopBB, &CurReg, InitReg);
+        IsUpdatePriorComp = false;
+      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
     } else {
       if (UpdateInst)
         return false;
@@ -9833,7 +9833,7 @@ static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
       case AArch64::SUBXri:
       case AArch64::SUBWri:
         UpdateInst = Def;
-        *UpdateCounterOprNum = 1;
+        UpdateCounterOprNum = 1;
         break;
       case AArch64::ADDSXrr:
       case AArch64::ADDSWrr:
@@ -9845,16 +9845,16 @@ static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
       case AArch64::SUBWrr:
         UpdateInst = Def;
         if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
-          *UpdateCounterOprNum = 1;
+          UpdateCounterOprNum = 1;
         else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
-          *UpdateCounterOprNum = 2;
+          UpdateCounterOprNum = 2;
         else
           return false;
         break;
       default:
         return false;
       }
-      CurReg = Def->getOperand(*UpdateCounterOprNum).getReg();
+      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
     }
 
     if (!CurReg.isVirtual())
@@ -9952,7 +9952,7 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
   bool IsUpdatePriorComp;
   unsigned UpdateCounterOprNum;
   if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
-                     Update, &UpdateCounterOprNum, &Init, &IsUpdatePriorComp))
+                     Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
     return nullptr;
 
   return std::make_unique<AArch64PipelinerLoopInfo>(
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
new file mode 100644
index 0000000000000..a7ae15ee45985
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
@@ -0,0 +1,83 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# UNSUPPORTED: target={{.*}}
+# Compare-and-branch instructions are not supported yet.
+
+# An acceptable loop for the pipeliner: TBB == ExitBB, FBB == LoopBB, compare and branch
+# CHECK: Schedule Found? 1
+
+--- |
+  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+  entry:
+    %or.cond = icmp ult i32 %n, 2
+    br i1 %or.cond, label %for.end, label %for.body.preheader
+  
+  for.body.preheader:                               ; preds = %entry
+    %i.07 = add i32 %n, -1
+    %0 = sext i32 %i.07 to i64
+    br label %for.body
+  
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %1 = shl nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %b, i64 %1
+    %2 = load float, ptr %scevgep, align 4
+    %add = fadd float %2, 1.000000e+00
+    %3 = shl nsw i64 %indvars.iv, 2
+    %scevgep11 = getelementptr i8, ptr %a, i64 %3
+    store float %add, ptr %scevgep11, align 4
+    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+    %4 = add i64 %indvars.iv, -1
+    %5 = and i64 %4, 4294967295
+    %tobool.not = icmp eq i64 %5, 0
+    br i1 %tobool.not, label %for.end, label %for.body
+  
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+  
+...
+---
+name:            func
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '%3' }
+  - { reg: '$x1', virtual-reg: '%4' }
+  - { reg: '$w2', virtual-reg: '%5' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $w2
+  
+    %5:gpr32common = COPY $w2
+    %4:gpr64common = COPY $x1
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+    Bcc 3, %bb.3, implicit $nzcv
+    B %bb.1
+  
+  bb.1.for.body.preheader:
+    %7:gpr32common = SUBWri %5, 1, 0
+    %9:gpr64all = IMPLICIT_DEF
+    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+    %10:gpr64 = SBFMXri killed %8, 0, 31
+    %0:gpr64all = COPY %10
+    %12:fpr32 = FMOVSi 112
+  
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+    %14:gpr64common = SUBXri %1, 1, 0
+    %2:gpr64all = COPY %14
+    %15:gpr32 = COPY %14.sub_32
+    CBZW killed %15, %bb.3
+    B %bb.2
+  
+  bb.3.for.end:
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
new file mode 100644
index 0000000000000..c4064eac58125
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
@@ -0,0 +1,83 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# UNSUPPORTED: target={{.*}}
+# Compare-and-branch instructions are not supported yet.
+
+# An acceptable loop for the pipeliner: TBB == LoopBB, FBB == ExitBB, compare and branch
+# CHECK: Schedule Found? 1
+
+--- |
+  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+  entry:
+    %or.cond = icmp ult i32 %n, 2
+    br i1 %or.cond, label %for.end, label %for.body.preheader
+  
+  for.body.preheader:                               ; preds = %entry
+    %i.07 = add i32 %n, -1
+    %0 = sext i32 %i.07 to i64
+    br label %for.body
+  
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %1 = shl nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %b, i64 %1
+    %2 = load float, ptr %scevgep, align 4
+    %add = fadd float %2, 1.000000e+00
+    %3 = shl nsw i64 %indvars.iv, 2
+    %scevgep11 = getelementptr i8, ptr %a, i64 %3
+    store float %add, ptr %scevgep11, align 4
+    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+    %4 = add i64 %indvars.iv, -1
+    %5 = and i64 %4, 4294967295
+    %tobool.not = icmp eq i64 %5, 0
+    br i1 %tobool.not, label %for.end, label %for.body
+  
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+  
+...
+---
+name:            func
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '%3' }
+  - { reg: '$x1', virtual-reg: '%4' }
+  - { reg: '$w2', virtual-reg: '%5' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $w2
+  
+    %5:gpr32common = COPY $w2
+    %4:gpr64common = COPY $x1
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+    Bcc 3, %bb.3, implicit $nzcv
+    B %bb.1
+  
+  bb.1.for.body.preheader:
+    %7:gpr32common = SUBWri %5, 1, 0
+    %9:gpr64all = IMPLICIT_DEF
+    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+    %10:gpr64 = SBFMXri killed %8, 0, 31
+    %0:gpr64all = COPY %10
+    %12:fpr32 = FMOVSi 112
+  
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+    %14:gpr64common = SUBXri %1, 1, 0
+    %2:gpr64all = COPY %14
+    %15:gpr32 = COPY %14.sub_32
+    CBNZW killed %15, %bb.2
+    B %bb.3
+  
+  bb.3.for.end:
+    RET_ReallyLR
+
+...

>From 5831275d2d5749d1c394bd684b1910cfeb50303c Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Thu, 6 Jun 2024 12:59:08 +0000
Subject: [PATCH 11/11] fixup! [ModuloSchedule] Implement modulo variable
 expansion for pipelining

---
 .../CodeGen/AArch64/sms-unpipeline-insts2.mir | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir

diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
new file mode 100644
index 0000000000000..80e67397cc8f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
@@ -0,0 +1,84 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# UNSUPPORTED: target={{.*}}
+# Compare-and-branch instructions are not supported yet.
+
+# An acceptable loop for the pipeliner (TBB == LoopBB, FBB == ExitBB, compare and branch); the loop-control counter update below must not be pipelined
+# CHECK: SU([[SU0:[0-9]+]]):   [[V0:%[0-9]+]]:gpr64common = SUBXri [[V1:%[0-9]+]]:gpr64common, 1, 0
+# CHECK: Do not pipeline SU([[SU0]])
+
+--- |
+  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+  entry:
+    %or.cond = icmp ult i32 %n, 2
+    br i1 %or.cond, label %for.end, label %for.body.preheader
+  
+  for.body.preheader:                               ; preds = %entry
+    %i.07 = add i32 %n, -1
+    %0 = sext i32 %i.07 to i64
+    br label %for.body
+  
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %1 = shl nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %b, i64 %1
+    %2 = load float, ptr %scevgep, align 4
+    %add = fadd float %2, 1.000000e+00
+    %3 = shl nsw i64 %indvars.iv, 2
+    %scevgep11 = getelementptr i8, ptr %a, i64 %3
+    store float %add, ptr %scevgep11, align 4
+    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+    %4 = add i64 %indvars.iv, -1
+    %5 = and i64 %4, 4294967295
+    %tobool.not = icmp eq i64 %5, 0
+    br i1 %tobool.not, label %for.end, label %for.body
+  
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+  
+...
+---
+name:            func
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '%3' }
+  - { reg: '$x1', virtual-reg: '%4' }
+  - { reg: '$w2', virtual-reg: '%5' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $w2
+  
+    %5:gpr32common = COPY $w2
+    %4:gpr64common = COPY $x1
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+    Bcc 3, %bb.3, implicit $nzcv
+    B %bb.1
+  
+  bb.1.for.body.preheader:
+    %7:gpr32common = SUBWri %5, 1, 0
+    %9:gpr64all = IMPLICIT_DEF
+    %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+    %10:gpr64 = SBFMXri killed %8, 0, 31
+    %0:gpr64all = COPY %10
+    %12:fpr32 = FMOVSi 112
+  
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+    %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+    %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+    STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+    %14:gpr64common = SUBXri %1, 1, 0
+    %2:gpr64all = COPY %14
+    %15:gpr32 = COPY %14.sub_32
+    CBNZW killed %15, %bb.2
+    B %bb.3
+  
+  bb.3.for.end:
+    RET_ReallyLR
+
+...


