[llvm] [ModuloSchedule] Implement modulo variable expansion for pipelining (PR #65609)

Fri Sep 22 06:58:27 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-arm

<details>
<summary>Changes</summary>

Modulo variable expansion is a technique that resolves overlapping variable lifetimes by unrolling the loop. For processors with ordinary registers, such as Arm and x86, the existing implementation resolves the overlap by copying values with move instructions. This method may result in a very large number of move instructions, which can cause performance problems.

Modulo variable expansion is enabled by specifying -pipeliner-mve-cg. A backend must implement some newly defined interfaces in PipelinerLoopInfo.

Discourse thread: https://discourse.llvm.org/t/implementing-modulo-variable-expansion-for-machinepipeliner


---

Patch is 32.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/65609.diff


7 Files Affected:

- (modified) llvm/include/llvm/CodeGen/ModuloSchedule.h (+65) 
- (modified) llvm/include/llvm/CodeGen/TargetInstrInfo.h (+17) 
- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+9) 
- (modified) llvm/lib/CodeGen/ModuloSchedule.cpp (+619) 
- (modified) llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp (+9) 
- (modified) llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp (+9) 
- (modified) llvm/lib/Target/PowerPC/PPCInstrInfo.cpp (+9) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index d03f7b4959159e4..8aa0a1a81186ad6 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -369,6 +369,71 @@ class PeelingModuloScheduleExpander {
   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
 };
 
+/// Expand the kernel using modulo variable expansion algorithm (MVE).
+/// It unrolls the kernel enough to avoid overlap of register lifetime.
+class ModuloScheduleExpanderMVE {
+private:
+  using ValueMapTy = DenseMap<unsigned, unsigned>;
+  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
+  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
+
+  ModuloSchedule &Schedule;
+  MachineFunction &MF;
+  const TargetSubtargetInfo &ST;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo *TII = nullptr;
+  LiveIntervals &LIS;
+
+  MachineBasicBlock *OrigKernel = nullptr;
+  MachineBasicBlock *OrigPreheader = nullptr;
+  MachineBasicBlock *OrigExit = nullptr;
+  MachineBasicBlock *Check = nullptr;
+  MachineBasicBlock *Prolog = nullptr;
+  MachineBasicBlock *NewKernel = nullptr;
+  MachineBasicBlock *Epilog = nullptr;
+  MachineBasicBlock *NewPreheader = nullptr;
+  MachineBasicBlock *NewExit = nullptr;
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
+
+  /// The number of unroll required to avoid overlap of live ranges.
+  /// NumUnroll = 1 means no unrolling.
+  int NumUnroll;
+
+  void calcNumUnroll();
+  void generatePipelinedLoop();
+  void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
+  void generatePhi(MachineInstr *OrigMI, int UnrollNum,
+                   SmallVectorImpl<ValueMapTy> &PrologVRMap,
+                   SmallVectorImpl<ValueMapTy> &KernelVRMap,
+                   SmallVectorImpl<ValueMapTy> &PhiVRMap);
+  void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
+                      SmallVectorImpl<ValueMapTy> &KernelVRMap);
+  void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
+                      SmallVectorImpl<ValueMapTy> &EpilogVRMap);
+  void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
+
+  MachineInstr *cloneInstr(MachineInstr *OldMI);
+
+  void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef);
+
+  void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
+                         unsigned UnrollNum,
+                         SmallVectorImpl<ValueMapTy> &VRMapProlog,
+                         SmallVectorImpl<ValueMapTy> &VRMapPhi);
+  void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum,
+                      SmallVectorImpl<ValueMapTy> &CurVRMap,
+                      SmallVectorImpl<ValueMapTy> *PrevVRMap);
+
+public:
+  ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
+                            LiveIntervals &LIS)
+      : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
+        TII(ST.getInstrInfo()), LIS(LIS) {}
+
+  void expand();
+  static bool canApply(MachineLoop &L);
+};
+
 /// Expander that simply annotates each scheduled instruction with a post-instr
 /// symbol that can be consumed by the ModuloScheduleTest pass.
 ///
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1c2ca8678346472..5344960b711cdf9 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -748,6 +748,19 @@ class TargetInstrInfo : public MCInstrInfo {
     createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
                                     SmallVectorImpl<MachineOperand> &Cond) = 0;
 
+    /// Create a condition to determine if the remaining trip count for a phase
+    /// is greater than TC. Some instructions such as comparisons may be
+    /// inserted at the bottom of MBB. All the instructions expanded for the
+    /// phase must be inserted in MBB before calling this function. RegMap is
+    /// the map from the original registers to the expanded registers for the
+    /// phase.
+    ///
+    /// MBB can also be a predecessor of the prologue block. Then RegMap must be
+    /// empty and the compared value is the initial value of the trip count.
+    virtual void createRemainingIterationsGreaterCondition(
+        int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+        DenseMap<unsigned, unsigned> RegMap) = 0;
+
     /// Modify the loop such that the trip count is
     /// OriginalTC + TripCountAdjust.
     virtual void adjustTripCount(int TripCountAdjust) = 0;
@@ -761,6 +774,10 @@ class TargetInstrInfo : public MCInstrInfo {
     /// Once this function is called, no other functions on this object are
     /// valid; the loop has been removed.
     virtual void disposed() = 0;
+
+    /// Return true if the target can expand pipelined schedule with modulo
+    /// variable expansion.
+    virtual bool isMVEExpanderSupported() = 0;
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 788ff5b3b5acdfc..3242dcc855672ca 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -174,6 +174,10 @@ static cl::opt<bool> ExperimentalCodeGen(
     cl::desc(
         "Use the experimental peeling code generator for software pipelining"));
 
+static cl::opt<bool>
+    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
+               cl::desc("Use the MVE code generator for software pipelining"));
+
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -659,6 +663,11 @@ void SwingSchedulerDAG::schedule() {
   if (ExperimentalCodeGen && NewInstrChanges.empty()) {
     PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
     MSE.expand();
+  } else if (MVECodeGen && NewInstrChanges.empty() &&
+             LoopPipelinerInfo->isMVEExpanderSupported() &&
+             ModuloScheduleExpanderMVE::canApply(Loop)) {
+    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
+    MSE.expand();
   } else {
     ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
     MSE.expand();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0bef513342ff123..6a2d141e8f0f7cc 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2096,6 +2096,625 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
   MSE.cleanup();
 }
 
+MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
+  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+
+  // TODO: Offset information needs to be corrected.
+  NewMI->dropMemRefs(MF);
+
+  return NewMI;
+}
+
+/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
+/// If it is already dedicated exit, return it. Otherwise, insert a new
+/// block between them and return the new block.
+static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
+                                              MachineBasicBlock *Exit) {
+  if (Exit->pred_size() == 1)
+    return Exit;
+
+  MachineFunction *MF = Loop->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+  MachineBasicBlock *NewExit =
+      MF->CreateMachineBasicBlock(Loop->getBasicBlock());
+  MF->insert(Loop->getIterator(), NewExit);
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+  if (TBB == Loop)
+    FBB = NewExit;
+  else if (FBB == Loop)
+    TBB = NewExit;
+  else
+    llvm_unreachable("unexpected loop structure");
+  TII->removeBranch(*Loop);
+  TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
+  Loop->removeSuccessor(Exit);
+  Loop->addSuccessor(NewExit);
+  TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
+  NewExit->addSuccessor(Exit);
+
+  for (MachineInstr &Phi : Exit->phis()) {
+    for (MachineOperand &MO : Phi.operands())
+      if (MO.isMBB() && MO.getMBB() == Loop)
+        MO.setMBB(NewExit);
+  }
+
+  return NewExit;
+}
+
+/// Generate a pipelined loop that is unrolled by using MVE algorithm and any
+/// other necessary blocks. The control flow is modified to execute the
+/// pipelined loop if the trip count satisfies the condition, otherwise the
+/// original loop. The original loop is also used to execute the remainder
+/// iterations which occur due to unrolling.
+void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
+  // The control flow for pipelining with MVE:
+  //
+  // OrigPreheader:
+  //   // The block that is originally the loop preheader
+  //   goto Check
+  //
+  // Check:
+  //   // Check whether the trip count satisfies the requirements to pipeline.
+  //   if (LoopCounter > NumStages + NumUnroll - 2)
+  //     // The minimum number of iterations to pipeline =
+  //     //   iterations executed in prolog/epilog (NumStages-1) +
+  //     //   iterations executed in one kernel run (NumUnroll)
+  //     goto Prolog
+  //   // fallback to the original loop
+  //   goto NewPreheader
+  //
+  // Prolog:
+  //   // All prolog stages. There are no direct branches to the epilogue.
+  //   goto NewKernel
+  //
+  // NewKernel:
+  //   // NumUnroll copies of the kernel
+  //   if (LoopCounter > MVE-1)
+  //     goto NewKernel
+  //   goto Epilog
+  //
+  // Epilog:
+  //   // All epilog stages.
+  //   if (LoopCounter > 0)
+  //     // The remainder is executed in the original loop
+  //     goto NewPreheader
+  //   goto NewExit
+  //
+  // NewPreheader:
+  //   // Newly created preheader for the original loop.
+  //   // The initial values of the phis in the loop are merged from two paths.
+  //   NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
+  //   goto OrigKernel
+  //
+  // OrigKernel:
+  //   // The original loop block.
+  //   if (LoopCounter != 0)
+  //     goto OrigKernel
+  //   goto NewExit
+  //
+  // NewExit:
+  //   // Newly created dedicated exit for the original loop.
+  //   // Merge values which are referenced after the loop
+  //   Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
+  //   goto OrigExit
+  //
+  // OrigExit:
+  //   // The block that is originally the loop exit.
+  //   // If it is already a dedicated exit, NewExit is not created.
+
+  // An example of where each stage is executed:
+  // Assume #Stages 3, #MVE 4, #Iterations 12
+  // Iter   0 1 2 3 4 5 6 7 8 9 10-11
+  // -------------------------------------------------
+  // Stage  0                          Prolog#0
+  // Stage  1 0                        Prolog#1
+  // Stage  2 1 0                      Kernel Unroll#0 Iter#0
+  // Stage    2 1 0                    Kernel Unroll#1 Iter#0
+  // Stage      2 1 0                  Kernel Unroll#2 Iter#0
+  // Stage        2 1 0                Kernel Unroll#3 Iter#0
+  // Stage          2 1 0              Kernel Unroll#0 Iter#1
+  // Stage            2 1 0            Kernel Unroll#1 Iter#1
+  // Stage              2 1 0          Kernel Unroll#2 Iter#1
+  // Stage                2 1 0        Kernel Unroll#3 Iter#1
+  // Stage                  2 1        Epilog#0
+  // Stage                    2        Epilog#1
+  // Stage                      0-2    OrigKernel
+
+  LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
+  assert(LoopInfo && "Must be able to analyze loop!");
+
+  calcNumUnroll();
+
+  Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+  NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+
+  MF.insert(OrigKernel->getIterator(), Check);
+  MF.insert(OrigKernel->getIterator(), Prolog);
+  MF.insert(OrigKernel->getIterator(), NewKernel);
+  MF.insert(OrigKernel->getIterator(), Epilog);
+  MF.insert(OrigKernel->getIterator(), NewPreheader);
+
+  NewExit = createDedicatedExit(OrigKernel, OrigExit);
+
+  NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
+  TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
+
+  OrigPreheader->addSuccessor(Check);
+  TII->removeBranch(*OrigPreheader);
+  TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
+
+  Check->addSuccessor(Prolog);
+  Check->addSuccessor(NewPreheader);
+
+  Prolog->addSuccessor(NewKernel);
+
+  NewKernel->addSuccessor(NewKernel);
+  NewKernel->addSuccessor(Epilog);
+
+  Epilog->addSuccessor(NewPreheader);
+  Epilog->addSuccessor(NewExit);
+
+  SmallVector<MachineOperand, 4> Cond;
+  LoopInfo->createRemainingIterationsGreaterCondition(
+      Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, ValueMapTy());
+  TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
+
+  // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
+  // register#
+  SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
+  generateProlog(PrologVRMap);
+  generateKernel(PrologVRMap, KernelVRMap);
+  generateEpilog(KernelVRMap, EpilogVRMap);
+}
+
+/// Replace MI's use operands according to the maps.
+void ModuloScheduleExpanderMVE::updateInstrUse(
+    MachineInstr *MI, int StageNum, int PhaseNum,
+    SmallVectorImpl<ValueMapTy> &CurVRMap,
+    SmallVectorImpl<ValueMapTy> *PrevVRMap) {
+  // If MI is in the prolog/kernel/epilog block, CurVRMap is
+  // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
+  // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
+  // Refer to the appropriate map according to the stage difference between
+  // MI and the definition of an operand.
+
+  for (MachineOperand &UseMO : MI->uses()) {
+    if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
+      continue;
+    int DiffStage = 0;
+    Register OrigReg = UseMO.getReg();
+    MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
+    if (!DefInst || DefInst->getParent() != OrigKernel)
+      continue;
+    unsigned InitReg = 0;
+    unsigned DefReg = OrigReg;
+    if (DefInst->isPHI()) {
+      ++DiffStage;
+      unsigned LoopReg;
+      getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
+      // LoopReg is guaranteed to be defined within the loop by canApply()
+      DefReg = LoopReg;
+      DefInst = MRI.getVRegDef(LoopReg);
+    }
+    unsigned DefStageNum = Schedule.getStage(DefInst);
+    DiffStage += StageNum - DefStageNum;
+    Register NewReg;
+    if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
+      // NewReg is defined in a previous phase of the same block
+      NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
+    else if (!PrevVRMap)
+      // Since this is the first iteration, refer the initial register of the
+      // loop
+      NewReg = InitReg;
+    else
+      // Cases where DiffStage is larger than PhaseNum.
+      // If MI is in the kernel block, the value is defined by the previous
+      // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
+      // value is defined in the kernel block and KernelVRMap is referenced.
+      NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
+
+    const TargetRegisterClass *NRC =
+        MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
+    if (NRC)
+      UseMO.setReg(NewReg);
+    else {
+      Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+      BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+              SplitReg)
+          .addReg(NewReg);
+      UseMO.setReg(SplitReg);
+    }
+  }
+}
+
+/// Return a phi if Reg is referenced by the phi.
+/// canApply() guarantees that at most only one such phi exists.
+static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
+  for (MachineInstr &Phi : Loop->phis()) {
+    unsigned InitVal, LoopVal;
+    getPhiRegs(Phi, Loop, InitVal, LoopVal);
+    if (LoopVal == Reg)
+      return &Phi;
+  }
+  return nullptr;
+}
+
+/// Generate phis for registers defined by OrigMI.
+void ModuloScheduleExpanderMVE::generatePhi(
+    MachineInstr *OrigMI, int UnrollNum,
+    SmallVectorImpl<ValueMapTy> &PrologVRMap,
+    SmallVectorImpl<ValueMapTy> &KernelVRMap,
+    SmallVectorImpl<ValueMapTy> &PhiVRMap) {
+  int StageNum = Schedule.getStage(OrigMI);
+  bool UsePrologReg;
+  if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
+    UsePrologReg = true;
+  else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
+    UsePrologReg = false;
+  else
+    return;
+
+  // Examples that show which stages are merged by phi.
+  // Meaning of the symbol following the stage number:
+  //   a/b: Stages with the same letter are merged (UsePrologReg == true)
+  //   +: Merged with the initial value (UsePrologReg == false)
+  //   *: No phis required
+  //
+  // #Stages 3, #MVE 4
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0a                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2* 1* 0*           Kernel Unroll#0
+  // Stage     2* 1* 0+        Kernel Unroll#1
+  // Stage        2* 1+ 0a     Kernel Unroll#2
+  // Stage           2+ 1a 0b  Kernel Unroll#3
+  //
+  // #Stages 3, #MVE 2
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0a                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2* 1+ 0a           Kernel Unroll#0
+  // Stage     2+ 1a 0b        Kernel Unroll#1
+  //
+  // #Stages 3, #MVE 1
+  // Iter   0 1 2 3 4 5 6 7 8
+  // -----------------------------------------
+  // Stage  0*                 Prolog#0
+  // Stage  1a 0b              Prolog#1
+  // Stage  2+ 1a 0b           Kernel Unroll#0
+
+  for (MachineOperand &DefMO : OrigMI->defs()) {
+    if (!DefMO.isReg())
+      continue;
+    Register OrigReg = DefMO.getReg();
+    auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
+    if (NewReg == KernelVRMap[UnrollNum].end())
+      continue;
+    Register CorrespondReg;
+    if (UsePrologReg) {
+      int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
+      CorrespondReg = PrologVRMap[PrologNum][OrigReg];
+    } else {
+      MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
+      if (!Phi)
+        continue;
+      CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
+    }
+
+    assert(CorrespondReg.isValid());
+    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+    BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
+            TII->get(TargetOpcode::PHI), PhiReg)
+        .addReg(NewReg->second)
+        .addMBB(NewKernel)
+        .addReg(CorrespondReg)
+        .addMBB(Prolog);
+    PhiVRMap[UnrollNum][OrigReg] = PhiReg;
+  }
+}
+
+static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
+                          MachineBasicBlock *NewMBB) {
+  for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
+    if (Phi.getOperand(Idx).getReg() == OrigReg) {
+      Phi.getOperand(Idx).setReg(NewReg);
+      Phi.getOperand(Idx + 1).setMBB(NewMBB);
+      return;
+    }
+  }
+}
+
+/// Generate phis that merge values from multiple routes
+void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
+                                                          Register NewReg) {
+  SmallVector<MachineOperand *> UsesAfterLoop;
+  SmallVector<MachineInstr *> LoopPhis;
+  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
+                                         E = MRI.use_end();
+       I != E; ++I) {
+    MachineOperand &O = *I;
+    if (O.getParent()->getParent() != OrigKernel &&
+        O.getParent()->getParent() != Prolog &&
+        O.getParent()->getParent() != NewKernel &&
+        O.getParent()->getParent() != Epilog)
+      UsesAfterLoop.push_back(&O);
+    if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
+      LoopPhis.push_back(O.getParent());
+  }
+
+  // Merge the route that only execute the pipelined loop (when there are no
+  // remaining iterations) with the route that execute the original loop.
+  if (!UsesAfterLoop.empty()) {
+    Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+    BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
+            TII...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/65609