[llvm] [LoopUnroll] Rotate loop before unrolling inside of UnrollRuntimeLoopRemainder (PR #148243)

Marek Sedláček via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 15 06:24:36 PDT 2025


https://github.com/mark-sed updated https://github.com/llvm/llvm-project/pull/148243

>From c8bd1d29335974d9d8d72a1c152765d5ed3ca06a Mon Sep 17 00:00:00 2001
From: Marek Sedlacek <msedlacek at azul.com>
Date: Fri, 11 Jul 2025 13:22:03 +0000
Subject: [PATCH 1/3] Add loop rotation to runtime loop unrolling if this
 makes the latch exit count computable, which may then enable additional
 unrolling of the loop.

To minimize the possibility of rotating without then unrolling, the rotation
is done directly inside UnrollRuntimeLoopRemainder.
---
 .../llvm/Transforms/Utils/LoopRotationUtils.h |  15 +-
 .../llvm/Transforms/Utils/UnrollLoop.h        |  19 +-
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   5 +-
 .../Transforms/Utils/LoopRotationUtils.cpp    |  28 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 130 +++++----
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp |  14 +-
 .../Transforms/Utils/LoopUnrollRuntime.cpp    |  48 +++-
 .../LoopUnroll/full-unroll-avoid-partial.ll   |   4 +-
 .../runtime-loop-multiexit-dom-verify.ll      | 272 ++++++++++++------
 .../LoopUnroll/runtime-unroll-after-rotate.ll | 106 +++++++
 10 files changed, 457 insertions(+), 184 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/runtime-unroll-after-rotate.ll

diff --git a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
index c3643e0f27f94..1c83bd706ee3e 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H
 #define LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -32,12 +33,14 @@ class TargetTransformInfo;
 /// header. If the loop header's size exceeds the threshold, the loop rotation
 /// will give up. The flag IsUtilMode controls the heuristic used in the
 /// LoopRotation. If it is true, the profitability heuristic will be ignored.
-LLVM_ABI bool LoopRotation(Loop *L, LoopInfo *LI,
-                           const TargetTransformInfo *TTI, AssumptionCache *AC,
-                           DominatorTree *DT, ScalarEvolution *SE,
-                           MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
-                           bool RotationOnly, unsigned Threshold,
-                           bool IsUtilMode, bool PrepareForLTO = false);
+/// ProfitabilityCheck can override the general profitability check.
+LLVM_ABI bool LoopRotation(
+    Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
+    DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+    const SimplifyQuery &SQ, bool RotationOnly, unsigned Threshold,
+    bool IsUtilMode, bool PrepareForLTO = false,
+    function_ref<bool(Loop *, ScalarEvolution *)> ProfitabilityCheck =
+        [](Loop *, ScalarEvolution *) { return false; });
 
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 765c613b04a44..4499b6f43ba19 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -54,11 +54,14 @@ LLVM_ABI const Loop *addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
                                               LoopInfo *LI,
                                               NewLoopsMap &NewLoops);
 
-/// Represents the result of a \c UnrollLoop invocation.
+/// Represents the result of a \c UnrollLoop or \c UnrollAndJamLoop invocation.
 enum class LoopUnrollResult {
   /// The loop was not modified.
   Unmodified,
 
+  /// The loop was modified, but not unrolled.
+  Modified,
+
   /// The loop was partially unrolled -- we still have a loop, but with a
   /// smaller trip count.  We may also have emitted epilogue loop if the loop
   /// had a non-constant trip count.
@@ -69,6 +72,18 @@ enum class LoopUnrollResult {
   FullyUnrolled
 };
 
+/// Represents the result of a \c UnrollRuntimeLoopRemainder invocation.
+enum class LoopReminderUnrollResult {
+  /// The loop remainder was not modified.
+  Unmodified,
+
+  /// The loop was rotated, but not unrolled.
+  Rotated,
+
+  /// The loop remainder was unrolled.
+  Unrolled
+};
+
 struct UnrollLoopOptions {
   unsigned Count;
   bool Force;
@@ -90,7 +105,7 @@ LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO,
                                      Loop **RemainderLoop = nullptr,
                                      AAResults *AA = nullptr);
 
-LLVM_ABI bool UnrollRuntimeLoopRemainder(
+LLVM_ABI LoopReminderUnrollResult UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index a22d84dcf014d..8b1ab5a9e2181 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1357,8 +1357,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
   LoopUnrollResult UnrollResult = UnrollLoop(
       L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
-  if (UnrollResult == LoopUnrollResult::Unmodified)
-    return LoopUnrollResult::Unmodified;
+  if (UnrollResult == LoopUnrollResult::Unmodified ||
+      UnrollResult == LoopUnrollResult::Modified)
+    return UnrollResult;
 
   if (RemainderLoop) {
     std::optional<MDNode *> RemainderLoopID =
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 66d0573e83f65..3d93d8a1b7d4c 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -69,16 +69,19 @@ class LoopRotate {
   bool RotationOnly;
   bool IsUtilMode;
   bool PrepareForLTO;
+  function_ref<bool(Loop *, ScalarEvolution *)> ProfitabilityCheck;
 
 public:
   LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
              const TargetTransformInfo *TTI, AssumptionCache *AC,
              DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
              const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
-             bool PrepareForLTO)
+             bool PrepareForLTO,
+             function_ref<bool(Loop *, ScalarEvolution *)> ProfitabilityCheck)
       : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
         MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
-        IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
+        IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO),
+        ProfitabilityCheck(ProfitabilityCheck) {}
   bool processLoop(Loop *L);
 
 private:
@@ -440,9 +443,9 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
     // Rotate if either the loop latch does *not* exit the loop, or if the loop
     // latch was just simplified. Or if we think it will be profitable.
-    if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
-        !profitableToRotateLoopExitingLatch(L) &&
-        !canRotateDeoptimizingLatchExit(L))
+    if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch &&
+        IsUtilMode == false && !profitableToRotateLoopExitingLatch(L) &&
+        !canRotateDeoptimizingLatchExit(L) && !ProfitabilityCheck(L, SE))
       return Rotated;
 
     // Check size of original header and reject loop if it is very big or we can't
@@ -1053,13 +1056,14 @@ bool LoopRotate::processLoop(Loop *L) {
 
 
 /// The utility to convert a loop into a loop with bottom test.
-bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
-                        AssumptionCache *AC, DominatorTree *DT,
-                        ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
-                        const SimplifyQuery &SQ, bool RotationOnly = true,
-                        unsigned Threshold = unsigned(-1),
-                        bool IsUtilMode = true, bool PrepareForLTO) {
+bool llvm::LoopRotation(
+    Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
+    DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+    const SimplifyQuery &SQ, bool RotationOnly = true,
+    unsigned Threshold = unsigned(-1), bool IsUtilMode = true,
+    bool PrepareForLTO,
+    function_ref<bool(Loop *, ScalarEvolution *)> ProfitabilityCheck) {
   LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
-                IsUtilMode, PrepareForLTO);
+                IsUtilMode, PrepareForLTO, ProfitabilityCheck);
   return LR.processLoop(L);
 }
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 86b268de43cf6..74662092a014f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -486,12 +486,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
 
   // All these values should be taken only after peeling because they might have
   // changed.
-  BasicBlock *Preheader = L->getLoopPreheader();
-  BasicBlock *Header = L->getHeader();
   BasicBlock *LatchBlock = L->getLoopLatch();
-  SmallVector<BasicBlock *, 4> ExitBlocks;
-  L->getExitBlocks(ExitBlocks);
-  std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();
 
   const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
   const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
@@ -504,42 +499,6 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   if (MaxTripCount && ULO.Count > MaxTripCount)
     ULO.Count = MaxTripCount;
 
-  struct ExitInfo {
-    unsigned TripCount;
-    unsigned TripMultiple;
-    unsigned BreakoutTrip;
-    bool ExitOnTrue;
-    BasicBlock *FirstExitingBlock = nullptr;
-    SmallVector<BasicBlock *> ExitingBlocks;
-  };
-  DenseMap<BasicBlock *, ExitInfo> ExitInfos;
-  SmallVector<BasicBlock *, 4> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-  for (auto *ExitingBlock : ExitingBlocks) {
-    // The folding code is not prepared to deal with non-branch instructions
-    // right now.
-    auto *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-    if (!BI)
-      continue;
-
-    ExitInfo &Info = ExitInfos[ExitingBlock];
-    Info.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
-    Info.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
-    if (Info.TripCount != 0) {
-      Info.BreakoutTrip = Info.TripCount % ULO.Count;
-      Info.TripMultiple = 0;
-    } else {
-      Info.BreakoutTrip = Info.TripMultiple =
-          (unsigned)std::gcd(ULO.Count, Info.TripMultiple);
-    }
-    Info.ExitOnTrue = !L->contains(BI->getSuccessor(0));
-    Info.ExitingBlocks.push_back(ExitingBlock);
-    LLVM_DEBUG(dbgs() << "  Exiting block %" << ExitingBlock->getName()
-                      << ": TripCount=" << Info.TripCount
-                      << ", TripMultiple=" << Info.TripMultiple
-                      << ", BreakoutTrip=" << Info.BreakoutTrip << "\n");
-  }
-
   // Are we eliminating the loop control altogether?  Note that we can know
   // we're eliminating the backedge without knowing exactly which iteration
   // of the unrolled body exits.
@@ -552,17 +511,6 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   if (CompletelyUnroll)
     ULO.Runtime = false;
 
-  // Go through all exits of L and see if there are any phi-nodes there. We just
-  // conservatively assume that they're inserted to preserve LCSSA form, which
-  // means that complete unrolling might break this form. We need to either fix
-  // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
-  // now we just recompute LCSSA for the outer loop, but it should be possible
-  // to fix it in-place.
-  bool NeedToFixLCSSA =
-      PreserveLCSSA && CompletelyUnroll &&
-      any_of(ExitBlocks,
-             [](const BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
-
   // The current loop unroll pass can unroll loops that have
   // (1) single latch; and
   // (2a) latch is unconditional; or
@@ -587,21 +535,87 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
                                               : isEpilogProfitable(L);
 
+  LoopReminderUnrollResult UnrollReminderResult =
+      LoopReminderUnrollResult::Unmodified;
+  if (ULO.Runtime) {
+    UnrollReminderResult = UnrollRuntimeLoopRemainder(
+        L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+        ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+        PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+        RemainderLoop);
+    LatchBlock = L->getLoopLatch();
+    LatchIsExiting = L->isLoopExiting(LatchBlock);
+  }
+
   if (ULO.Runtime &&
-      !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
-                                  EpilogProfitability, ULO.UnrollRemainder,
-                                  ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
-                                  PreserveLCSSA, ULO.SCEVExpansionBudget,
-                                  ULO.RuntimeUnrollMultiExit, RemainderLoop)) {
+      UnrollReminderResult != LoopReminderUnrollResult::Unrolled) {
     if (ULO.Force)
       ULO.Runtime = false;
     else {
       LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
-      return LoopUnrollResult::Unmodified;
+      // Loop might have been rotated inside of UnrollRuntimeLoopRemainder and
+      // this needs to be propagated.
+      return UnrollReminderResult == LoopReminderUnrollResult::Rotated
+                 ? LoopUnrollResult::Modified
+                 : LoopUnrollResult::Unmodified;
+      ;
     }
   }
 
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();
+
+  // Go through all exits of L and see if there are any phi-nodes there. We just
+  // conservatively assume that they're inserted to preserve LCSSA form, which
+  // means that complete unrolling might break this form. We need to either fix
+  // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
+  // now we just recompute LCSSA for the outer loop, but it should be possible
+  // to fix it in-place.
+  bool NeedToFixLCSSA =
+      PreserveLCSSA && CompletelyUnroll &&
+      any_of(ExitBlocks,
+             [](const BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
+
+  struct ExitInfo {
+    unsigned TripCount;
+    unsigned TripMultiple;
+    unsigned BreakoutTrip;
+    bool ExitOnTrue;
+    BasicBlock *FirstExitingBlock = nullptr;
+    SmallVector<BasicBlock *> ExitingBlocks;
+  };
+  DenseMap<BasicBlock *, ExitInfo> ExitInfos;
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+  for (auto *ExitingBlock : ExitingBlocks) {
+    // The folding code is not prepared to deal with non-branch instructions
+    // right now.
+    auto *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+    if (!BI)
+      continue;
+
+    ExitInfo &Info = ExitInfos[ExitingBlock];
+    Info.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+    Info.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+    if (Info.TripCount != 0) {
+      Info.BreakoutTrip = Info.TripCount % ULO.Count;
+      Info.TripMultiple = 0;
+    } else {
+      Info.BreakoutTrip = Info.TripMultiple =
+          (unsigned)std::gcd(ULO.Count, Info.TripMultiple);
+    }
+    Info.ExitOnTrue = !L->contains(BI->getSuccessor(0));
+    Info.ExitingBlocks.push_back(ExitingBlock);
+    LLVM_DEBUG(dbgs() << "  Exiting block %" << ExitingBlock->getName()
+                      << ": TripCount=" << Info.TripCount
+                      << ", TripMultiple=" << Info.TripMultiple
+                      << ", BreakoutTrip=" << Info.BreakoutTrip << "\n");
+  }
+
   using namespace ore;
   // Report the unrolling decision.
   if (CompletelyUnroll) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index ca90bb65f5708..43189ddea4603 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -239,14 +239,16 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
 
   // We use the runtime remainder in cases where we don't know trip multiple
   if (TripMultiple % Count != 0) {
-    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
-                                    /*UseEpilogRemainder*/ true,
-                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
-                                    LI, SE, DT, AC, TTI, true,
-                                    SCEVCheapExpansionBudget, EpilogueLoop)) {
+    auto UnrollReminderResult = UnrollRuntimeLoopRemainder(
+        L, Count, /*AllowExpensiveTripCount*/ false,
+        /*UseEpilogRemainder*/ true, UnrollRemainder, /*ForgetAllSCEV*/ false,
+        LI, SE, DT, AC, TTI, true, SCEVCheapExpansionBudget, EpilogueLoop);
+    if (UnrollReminderResult != LoopReminderUnrollResult::Unrolled) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
-      return LoopUnrollResult::Unmodified;
+      return UnrollReminderResult == LoopReminderUnrollResult::Rotated
+                 ? LoopUnrollResult::Modified
+                 : LoopUnrollResult::Unmodified;
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index bf882d7406853..142f97b5c3a1a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -574,7 +575,7 @@ static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,
 ///        if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
 /// EpilExit:
 
-bool llvm::UnrollRuntimeLoopRemainder(
+LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
@@ -586,10 +587,31 @@ bool llvm::UnrollRuntimeLoopRemainder(
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
                                 : dbgs() << "Using prolog remainder.\n");
 
+  LoopReminderUnrollResult Result = LoopReminderUnrollResult::Unmodified;
+
+  // Rotate loop if it makes the exit count from the latch computable.
+  BasicBlock *OrigHeader = L->getHeader();
+  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+  if (BI && !BI->isUnconditional() &&
+      isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
+      !isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
+    LLVM_DEBUG(
+        dbgs() << "  Rotating loop to make the exit count computable.\n");
+    SimplifyQuery SQ{OrigHeader->getDataLayout()};
+    SQ.TLI = nullptr;
+    SQ.DT = DT;
+    SQ.AC = AC;
+    if (llvm::LoopRotation(L, LI, TTI, AC, DT, SE, nullptr /*MemorySSAUpdater*/,
+                           SQ, false /*RotationOnly*/, 16 /*Threshold*/,
+                           false /*IsUtilMode*/, false /*PrepareForLTO*/,
+                           [](Loop *, ScalarEvolution *) { return true; }))
+      Result = LoopReminderUnrollResult::Rotated;
+  }
+
   // Make sure the loop is in canonical form.
   if (!L->isLoopSimplifyForm()) {
     LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
-    return false;
+    return Result;
   }
 
   // Guaranteed by LoopSimplifyForm.
@@ -603,7 +625,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "Loop latch not terminated by a conditional branch.\n");
-    return false;
+    return Result;
   }
 
   unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
@@ -615,7 +637,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "One of the loop latch successors must be the exit block.\n");
-    return false;
+    return Result;
   }
 
   // These are exit blocks other than the target of the latch exiting block.
@@ -627,12 +649,12 @@ bool llvm::UnrollRuntimeLoopRemainder(
     // We rely on LCSSA form being preserved when the exit blocks are transformed.
     // (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)
     if (!PreserveLCSSA)
-      return false;
+      return Result;
 
     // Priority goes to UnrollRuntimeMultiExit if it's supplied.
     if (UnrollRuntimeMultiExit.getNumOccurrences()) {
       if (!UnrollRuntimeMultiExit)
-        return false;
+        return Result;
     } else {
       // Otherwise perform multi-exit unrolling, if either the target indicates
       // it is profitable or the general profitability heuristics apply.
@@ -641,14 +663,14 @@ bool llvm::UnrollRuntimeLoopRemainder(
                                                    UseEpilogRemainder)) {
         LLVM_DEBUG(dbgs() << "Multiple exit/exiting blocks in loop and "
                              "multi-exit unrolling not enabled!\n");
-        return false;
+        return Result;
       }
     }
   }
   // Use Scalar Evolution to compute the trip count. This allows more loops to
   // be unrolled than relying on induction var simplification.
   if (!SE)
-    return false;
+    return Result;
 
   // Only unroll loops with a computable trip count.
   // We calculate the backedge count by using getExitCount on the Latch block,
@@ -658,7 +680,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
   const SCEV *BECountSC = SE->getExitCount(L, Latch);
   if (isa<SCEVCouldNotCompute>(BECountSC)) {
     LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
-    return false;
+    return Result;
   }
 
   unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
@@ -669,7 +691,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
       SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
   if (isa<SCEVCouldNotCompute>(TripCountSC)) {
     LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
-    return false;
+    return Result;
   }
 
   BasicBlock *PreHeader = L->getLoopPreheader();
@@ -680,7 +702,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
       Expander.isHighCostExpansion(TripCountSC, L, SCEVExpansionBudget, TTI,
                                    PreHeaderBR)) {
     LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
-    return false;
+    return Result;
   }
 
   // This constraint lets us deal with an overflowing trip count easily; see the
@@ -689,7 +711,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "Count failed constraint on overflow trip count calculation.\n");
-    return false;
+    return Result;
   }
 
   // Loop structure is the following:
@@ -1035,5 +1057,5 @@ bool llvm::UnrollRuntimeLoopRemainder(
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
     *ResultLoop = remainderLoop;
   NumRuntimeUnrolled++;
-  return true;
+  return LoopReminderUnrollResult::Unrolled;
 }
diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll
index 7f266a754d1bc..2414206e0ce08 100644
--- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll
@@ -10,10 +10,10 @@
 ; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll] Loop %for.body
 ; LOOP-UNROLL-NEXT: Loop Size = 9
 ; LOOP-UNROLL-NEXT: runtime unrolling with count: 8
-; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1
 ; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop:
 ; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><latch><exiting>
 ; LOOP-UNROLL-NEXT: Using epilog remainder.
+; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=8, BreakoutTrip=8
 ; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 8 with run-time trip count!
 
 ; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll] Loop %for.body
@@ -49,10 +49,10 @@ for.body:                                         ; preds = %for.body.preheader,
 
 ; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count1] Loop %for.body
 ; LOOP-UNROLL-NEXT: Loop Size = 9
-; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1
 ; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop:
 ; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><latch><exiting>
 ; LOOP-UNROLL-NEXT: Using epilog remainder.
+; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1
 ; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5 with run-time trip count!
 
 ; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count1] Loop %for.body
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll
index de54852313456..b079abefaea65 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll
@@ -16,48 +16,86 @@ define i64 @test1() {
 ; CHECK-NEXT:    br label [[PREHEADER:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[TRIP:%.*]] = zext i32 undef to i64
+; CHECK-NEXT:    br i1 false, label [[LATCH_LR_PH:%.*]], label [[HEADEREXIT:%.*]]
+; CHECK:       latch.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[TRIP]], -5
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LATCH_PROL_PREHEADER:%.*]], label [[LATCH_PROL_LOOPEXIT:%.*]]
+; CHECK:       latch.prol.preheader:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
-; CHECK:       header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 2, [[PREHEADER]] ], [ [[ADD_IV_3:%.*]], [[LATCH_3:%.*]] ]
-; CHECK-NEXT:    [[ADD_IV:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK:       latch.prol:
+; CHECK-NEXT:    [[ADD_IV1_PROL:%.*]] = phi i64 [ 4, [[LATCH_PROL_PREHEADER]] ], [ [[ADD_IV:%.*]], [[HEADER_PROL:%.*]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ 0, [[LATCH_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    [[SHFT_PROL:%.*]] = ashr i64 [[ADD_IV1_PROL]], 1
+; CHECK-NEXT:    [[CMP2_PROL:%.*]] = icmp ult i64 [[SHFT_PROL]], [[TRIP]]
+; CHECK-NEXT:    br i1 [[CMP2_PROL]], label [[HEADER_PROL]], label [[LATCHEXIT_LOOPEXIT2:%.*]]
+; CHECK:       header.prol:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[ADD_IV1_PROL]], [[HEADER]] ]
+; CHECK-NEXT:    [[ADD_IV]] = add nuw nsw i64 [[IV]], 2
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[ADD_IV]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[LATCH:%.*]], label [[HEADEREXIT:%.*]]
-; CHECK:       latch:
-; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV]], 1
+; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER]], label [[LATCH_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       latch.prol.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[ADD_IV1_UNR_PH:%.*]] = phi i64 [ [[ADD_IV]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    [[SPLIT_UNR_PH:%.*]] = phi i64 [ [[ADD_IV]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    br label [[LATCH_PROL_LOOPEXIT]]
+; CHECK:       latch.prol.loopexit:
+; CHECK-NEXT:    [[ADD_IV1_UNR:%.*]] = phi i64 [ 4, [[LATCH_LR_PH]] ], [ [[ADD_IV1_UNR_PH]], [[LATCH_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[SPLIT_UNR:%.*]] = phi i64 [ poison, [[LATCH_LR_PH]] ], [ [[SPLIT_UNR_PH]], [[LATCH_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 3
+; CHECK-NEXT:    br i1 [[TMP5]], label [[HEADER_HEADEREXIT_CRIT_EDGE:%.*]], label [[LATCH_LR_PH_NEW:%.*]]
+; CHECK:       latch.lr.ph.new:
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[ADD_IV2:%.*]] = add nuw nsw i64 [[ADD_IV1:%.*]], 2
+; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV2]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[SHFT]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[HEADER_1:%.*]], label [[LATCHEXIT:%.*]]
 ; CHECK:       header.1:
-; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[CMP1_1:%.*]] = icmp ult i64 [[ADD_IV_1]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_1]], label [[LATCH_1:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.1:
+; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[ADD_IV1]], 4
 ; CHECK-NEXT:    [[SHFT_1:%.*]] = ashr i64 [[ADD_IV_1]], 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ult i64 [[SHFT_1]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_1]], label [[HEADER_2:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.2:
-; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[IV]], 6
-; CHECK-NEXT:    [[CMP1_2:%.*]] = icmp ult i64 [[ADD_IV_2]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_2]], label [[LATCH_2:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.2:
+; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[ADD_IV1]], 6
 ; CHECK-NEXT:    [[SHFT_2:%.*]] = ashr i64 [[ADD_IV_2]], 1
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ult i64 [[SHFT_2]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_2]], label [[HEADER_3:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.3:
-; CHECK-NEXT:    [[ADD_IV_3]] = add nuw nsw i64 [[IV]], 8
+; CHECK-NEXT:    [[ADD_IV_3:%.*]] = add nuw nsw i64 [[ADD_IV1]], 8
 ; CHECK-NEXT:    [[CMP1_3:%.*]] = icmp ult i64 [[ADD_IV_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH_3]], label [[HEADEREXIT]]
-; CHECK:       latch.3:
-; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV_3]], 1
+; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH]], label [[HEADER_HEADEREXIT_CRIT_EDGE_UNR_LCSSA:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[ADD_IV1]] = phi i64 [ [[ADD_IV1_UNR]], [[LATCH_LR_PH_NEW]] ], [ [[ADD_IV_3]], [[HEADER_3]] ]
+; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV1]], 1
 ; CHECK-NEXT:    [[CMP2_3:%.*]] = icmp ult i64 [[SHFT_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER]], label [[LATCHEXIT]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER1:%.*]], label [[LATCHEXIT]]
+; CHECK:       header.headerexit_crit_edge.unr-lcssa:
+; CHECK-NEXT:    [[SPLIT_PH:%.*]] = phi i64 [ [[ADD_IV_3]], [[HEADER_3]] ]
+; CHECK-NEXT:    br label [[HEADER_HEADEREXIT_CRIT_EDGE]]
+; CHECK:       header.headerexit_crit_edge:
+; CHECK-NEXT:    [[SPLIT:%.*]] = phi i64 [ [[SPLIT_UNR]], [[LATCH_PROL_LOOPEXIT]] ], [ [[SPLIT_PH]], [[HEADER_HEADEREXIT_CRIT_EDGE_UNR_LCSSA]] ]
+; CHECK-NEXT:    br label [[HEADEREXIT]]
 ; CHECK:       headerexit:
-; CHECK-NEXT:    [[ADDPHI:%.*]] = phi i64 [ [[ADD_IV]], [[HEADER]] ], [ [[ADD_IV_1]], [[HEADER_1]] ], [ [[ADD_IV_2]], [[HEADER_2]] ], [ [[ADD_IV_3]], [[HEADER_3]] ]
+; CHECK-NEXT:    [[ADDPHI:%.*]] = phi i64 [ [[SPLIT]], [[HEADER_HEADEREXIT_CRIT_EDGE]] ], [ 4, [[PREHEADER]] ]
+; CHECK-NEXT:    br label [[MERGEDEXIT1:%.*]]
+; CHECK:       latchexit.loopexit:
+; CHECK-NEXT:    [[SHFTPHI_PH:%.*]] = phi i64 [ [[SHFT_3]], [[LATCH]] ], [ [[SHFT]], [[HEADER1]] ], [ [[SHFT_1]], [[HEADER_1]] ], [ [[SHFT_2]], [[HEADER_2]] ]
 ; CHECK-NEXT:    br label [[MERGEDEXIT:%.*]]
-; CHECK:       latchexit:
-; CHECK-NEXT:    [[SHFTPHI:%.*]] = phi i64 [ [[SHFT]], [[LATCH]] ], [ [[SHFT_1]], [[LATCH_1]] ], [ [[SHFT_2]], [[LATCH_2]] ], [ [[SHFT_3]], [[LATCH_3]] ]
+; CHECK:       latchexit.loopexit2:
+; CHECK-NEXT:    [[SHFTPHI_PH3:%.*]] = phi i64 [ [[SHFT_PROL]], [[HEADER]] ]
 ; CHECK-NEXT:    br label [[MERGEDEXIT]]
+; CHECK:       latchexit:
+; CHECK-NEXT:    [[SHFTPHI:%.*]] = phi i64 [ [[SHFTPHI_PH]], [[LATCHEXIT]] ], [ [[SHFTPHI_PH3]], [[LATCHEXIT_LOOPEXIT2]] ]
+; CHECK-NEXT:    br label [[MERGEDEXIT1]]
 ; CHECK:       mergedexit:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[ADDPHI]], [[HEADEREXIT]] ], [ [[SHFTPHI]], [[LATCHEXIT]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[ADDPHI]], [[HEADEREXIT]] ], [ [[SHFTPHI]], [[MERGEDEXIT]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL]]
 ;
 entry:
@@ -98,42 +136,75 @@ define  void @test2(i1 %cond, i32 %n) {
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[PREHEADER:%.*]], label [[MERGEDEXIT:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[TRIP:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp ult i64 4, [[TRIP]]
+; CHECK-NEXT:    br i1 [[CMP11]], label [[LATCH_LR_PH:%.*]], label [[HEADEREXIT:%.*]]
+; CHECK:       latch.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[TRIP]], -5
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LATCH_PROL_PREHEADER:%.*]], label [[LATCH_PROL_LOOPEXIT:%.*]]
+; CHECK:       latch.prol.preheader:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
-; CHECK:       header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 2, [[PREHEADER]] ], [ [[ADD_IV_3:%.*]], [[LATCH_3:%.*]] ]
-; CHECK-NEXT:    [[ADD_IV:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK:       latch.prol:
+; CHECK-NEXT:    [[ADD_IV2_PROL:%.*]] = phi i64 [ 4, [[LATCH_PROL_PREHEADER]] ], [ [[ADD_IV:%.*]], [[HEADER_PROL:%.*]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ 0, [[LATCH_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    [[SHFT_PROL:%.*]] = ashr i64 [[ADD_IV2_PROL]], 1
+; CHECK-NEXT:    [[CMP2_PROL:%.*]] = icmp ult i64 [[SHFT_PROL]], [[TRIP]]
+; CHECK-NEXT:    br i1 [[CMP2_PROL]], label [[HEADER_PROL]], label [[LATCHEXIT_LOOPEXIT3:%.*]]
+; CHECK:       header.prol:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[ADD_IV2_PROL]], [[HEADER]] ]
+; CHECK-NEXT:    [[ADD_IV]] = add nuw nsw i64 [[IV]], 2
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[ADD_IV]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[LATCH:%.*]], label [[HEADEREXIT:%.*]]
-; CHECK:       latch:
-; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV]], 1
+; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER]], label [[LATCH_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       latch.prol.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[ADD_IV2_UNR_PH:%.*]] = phi i64 [ [[ADD_IV]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    br label [[LATCH_PROL_LOOPEXIT]]
+; CHECK:       latch.prol.loopexit:
+; CHECK-NEXT:    [[ADD_IV2_UNR:%.*]] = phi i64 [ 4, [[LATCH_LR_PH]] ], [ [[ADD_IV2_UNR_PH]], [[LATCH_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 3
+; CHECK-NEXT:    br i1 [[TMP5]], label [[HEADER_HEADEREXIT_CRIT_EDGE:%.*]], label [[LATCH_LR_PH_NEW:%.*]]
+; CHECK:       latch.lr.ph.new:
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[ADD_IV1:%.*]] = add nuw nsw i64 [[ADD_IV2:%.*]], 2
+; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV1]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[SHFT]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[HEADER_1:%.*]], label [[LATCHEXIT:%.*]]
 ; CHECK:       header.1:
-; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[CMP1_1:%.*]] = icmp ult i64 [[ADD_IV_1]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_1]], label [[LATCH_1:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.1:
+; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[ADD_IV2]], 4
 ; CHECK-NEXT:    [[SHFT_1:%.*]] = ashr i64 [[ADD_IV_1]], 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ult i64 [[SHFT_1]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_1]], label [[HEADER_2:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.2:
-; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[IV]], 6
-; CHECK-NEXT:    [[CMP1_2:%.*]] = icmp ult i64 [[ADD_IV_2]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_2]], label [[LATCH_2:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.2:
+; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[ADD_IV2]], 6
 ; CHECK-NEXT:    [[SHFT_2:%.*]] = ashr i64 [[ADD_IV_2]], 1
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ult i64 [[SHFT_2]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_2]], label [[HEADER_3:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.3:
-; CHECK-NEXT:    [[ADD_IV_3]] = add nuw nsw i64 [[IV]], 8
+; CHECK-NEXT:    [[ADD_IV_3:%.*]] = add nuw nsw i64 [[ADD_IV2]], 8
 ; CHECK-NEXT:    [[CMP1_3:%.*]] = icmp ult i64 [[ADD_IV_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH_3]], label [[HEADEREXIT]]
-; CHECK:       latch.3:
-; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV_3]], 1
+; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH]], label [[HEADER_HEADEREXIT_CRIT_EDGE_UNR_LCSSA:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[ADD_IV2]] = phi i64 [ [[ADD_IV2_UNR]], [[LATCH_LR_PH_NEW]] ], [ [[ADD_IV_3]], [[HEADER_3]] ]
+; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV2]], 1
 ; CHECK-NEXT:    [[CMP2_3:%.*]] = icmp ult i64 [[SHFT_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER]], label [[LATCHEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER1:%.*]], label [[LATCHEXIT]]
+; CHECK:       header.headerexit_crit_edge.unr-lcssa:
+; CHECK-NEXT:    br label [[HEADER_HEADEREXIT_CRIT_EDGE]]
+; CHECK:       header.headerexit_crit_edge:
+; CHECK-NEXT:    br label [[HEADEREXIT]]
 ; CHECK:       headerexit:
 ; CHECK-NEXT:    br label [[MERGEDEXIT]]
+; CHECK:       latchexit.loopexit:
+; CHECK-NEXT:    br label [[LATCHEXIT1:%.*]]
+; CHECK:       latchexit.loopexit3:
+; CHECK-NEXT:    br label [[LATCHEXIT1]]
 ; CHECK:       latchexit:
 ; CHECK-NEXT:    br label [[MERGEDEXIT]]
 ; CHECK:       mergedexit:
@@ -175,44 +246,79 @@ define i64 @test3(i32 %n) {
 ; CHECK-NEXT:    br label [[PREHEADER:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[TRIP:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp ult i64 4, [[TRIP]]
+; CHECK-NEXT:    br i1 [[CMP11]], label [[LATCH_LR_PH:%.*]], label [[HEADEREXIT:%.*]]
+; CHECK:       latch.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[TRIP]], -5
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LATCH_PROL_PREHEADER:%.*]], label [[LATCH_PROL_LOOPEXIT:%.*]]
+; CHECK:       latch.prol.preheader:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
-; CHECK:       header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 2, [[PREHEADER]] ], [ [[ADD_IV_3:%.*]], [[LATCH_3:%.*]] ]
-; CHECK-NEXT:    [[ADD_IV:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK:       latch.prol:
+; CHECK-NEXT:    [[ADD_IV2_PROL:%.*]] = phi i64 [ 4, [[LATCH_PROL_PREHEADER]] ], [ [[ADD_IV:%.*]], [[HEADER_PROL:%.*]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ 0, [[LATCH_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    [[SHFT_PROL:%.*]] = ashr i64 [[ADD_IV2_PROL]], 1
+; CHECK-NEXT:    [[CMP2_PROL:%.*]] = icmp ult i64 [[SHFT_PROL]], [[TRIP]]
+; CHECK-NEXT:    br i1 [[CMP2_PROL]], label [[HEADER_PROL]], label [[LATCHEXIT_LOOPEXIT3:%.*]]
+; CHECK:       header.prol:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[ADD_IV2_PROL]], [[HEADER]] ]
+; CHECK-NEXT:    [[ADD_IV]] = add nuw nsw i64 [[IV]], 2
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[ADD_IV]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[LATCH:%.*]], label [[HEADEREXIT:%.*]]
-; CHECK:       latch:
-; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV]], 1
+; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER]], label [[LATCH_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       latch.prol.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[ADD_IV2_UNR_PH:%.*]] = phi i64 [ [[ADD_IV]], [[HEADER_PROL]] ]
+; CHECK-NEXT:    br label [[LATCH_PROL_LOOPEXIT]]
+; CHECK:       latch.prol.loopexit:
+; CHECK-NEXT:    [[ADD_IV2_UNR:%.*]] = phi i64 [ 4, [[LATCH_LR_PH]] ], [ [[ADD_IV2_UNR_PH]], [[LATCH_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 3
+; CHECK-NEXT:    br i1 [[TMP5]], label [[HEADER_HEADEREXIT_CRIT_EDGE:%.*]], label [[LATCH_LR_PH_NEW:%.*]]
+; CHECK:       latch.lr.ph.new:
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[ADD_IV1:%.*]] = add nuw nsw i64 [[ADD_IV2:%.*]], 2
+; CHECK-NEXT:    [[SHFT:%.*]] = ashr i64 [[ADD_IV1]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[SHFT]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[HEADER_1:%.*]], label [[LATCHEXIT:%.*]]
 ; CHECK:       header.1:
-; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[IV]], 4
-; CHECK-NEXT:    [[CMP1_1:%.*]] = icmp ult i64 [[ADD_IV_1]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_1]], label [[LATCH_1:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.1:
+; CHECK-NEXT:    [[ADD_IV_1:%.*]] = add nuw nsw i64 [[ADD_IV2]], 4
 ; CHECK-NEXT:    [[SHFT_1:%.*]] = ashr i64 [[ADD_IV_1]], 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ult i64 [[SHFT_1]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_1]], label [[HEADER_2:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.2:
-; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[IV]], 6
-; CHECK-NEXT:    [[CMP1_2:%.*]] = icmp ult i64 [[ADD_IV_2]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_2]], label [[LATCH_2:%.*]], label [[HEADEREXIT]]
-; CHECK:       latch.2:
+; CHECK-NEXT:    [[ADD_IV_2:%.*]] = add nuw nsw i64 [[ADD_IV2]], 6
 ; CHECK-NEXT:    [[SHFT_2:%.*]] = ashr i64 [[ADD_IV_2]], 1
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ult i64 [[SHFT_2]], [[TRIP]]
 ; CHECK-NEXT:    br i1 [[CMP2_2]], label [[HEADER_3:%.*]], label [[LATCHEXIT]]
 ; CHECK:       header.3:
-; CHECK-NEXT:    [[ADD_IV_3]] = add nuw nsw i64 [[IV]], 8
+; CHECK-NEXT:    [[ADD_IV_3:%.*]] = add nuw nsw i64 [[ADD_IV2]], 8
 ; CHECK-NEXT:    [[CMP1_3:%.*]] = icmp ult i64 [[ADD_IV_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH_3]], label [[HEADEREXIT]]
-; CHECK:       latch.3:
-; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV_3]], 1
+; CHECK-NEXT:    br i1 [[CMP1_3]], label [[LATCH]], label [[HEADER_HEADEREXIT_CRIT_EDGE_UNR_LCSSA:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[ADD_IV2]] = phi i64 [ [[ADD_IV2_UNR]], [[LATCH_LR_PH_NEW]] ], [ [[ADD_IV_3]], [[HEADER_3]] ]
+; CHECK-NEXT:    [[SHFT_3:%.*]] = ashr i64 [[ADD_IV2]], 1
 ; CHECK-NEXT:    [[CMP2_3:%.*]] = icmp ult i64 [[SHFT_3]], [[TRIP]]
-; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER]], label [[LATCHEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP2_3]], label [[HEADER1:%.*]], label [[LATCHEXIT]]
+; CHECK:       header.headerexit_crit_edge.unr-lcssa:
+; CHECK-NEXT:    br label [[HEADER_HEADEREXIT_CRIT_EDGE]]
+; CHECK:       header.headerexit_crit_edge:
+; CHECK-NEXT:    br label [[HEADEREXIT]]
 ; CHECK:       headerexit:
 ; CHECK-NEXT:    br label [[EXITSUCC:%.*]]
+; CHECK:       latchexit.loopexit:
+; CHECK-NEXT:    [[SHFTPHI_PH:%.*]] = phi i64 [ [[SHFT_3]], [[LATCH]] ], [ [[SHFT]], [[HEADER1]] ], [ [[SHFT_1]], [[HEADER_1]] ], [ [[SHFT_2]], [[HEADER_2]] ]
+; CHECK-NEXT:    br label [[LATCHEXIT1:%.*]]
+; CHECK:       latchexit.loopexit3:
+; CHECK-NEXT:    [[SHFTPHI_PH4:%.*]] = phi i64 [ [[SHFT_PROL]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[LATCHEXIT1]]
 ; CHECK:       latchexit:
-; CHECK-NEXT:    [[SHFTPHI:%.*]] = phi i64 [ [[SHFT]], [[LATCH]] ], [ [[SHFT_1]], [[LATCH_1]] ], [ [[SHFT_2]], [[LATCH_2]] ], [ [[SHFT_3]], [[LATCH_3]] ]
+; CHECK-NEXT:    [[SHFTPHI:%.*]] = phi i64 [ [[SHFTPHI_PH]], [[LATCHEXIT]] ], [ [[SHFTPHI_PH4]], [[LATCHEXIT_LOOPEXIT3]] ]
 ; CHECK-NEXT:    ret i64 [[SHFTPHI]]
 ; CHECK:       exitsucc:
 ; CHECK-NEXT:    ret i64 96
@@ -265,15 +371,15 @@ define void @test4(i16 %c3) {
 ; CHECK-NEXT:    br label [[EXITING_PROL:%.*]]
 ; CHECK:       exiting.prol:
 ; CHECK-NEXT:    switch i16 [[C3:%.*]], label [[DEFAULT_LOOPEXIT_LOOPEXIT1:%.*]] [
-; CHECK-NEXT:    i16 45, label [[OTHEREXIT_LOOPEXIT2:%.*]]
-; CHECK-NEXT:    i16 95, label [[LATCH_PROL]]
+; CHECK-NEXT:      i16 45, label [[OTHEREXIT_LOOPEXIT2:%.*]]
+; CHECK-NEXT:      i16 95, label [[LATCH_PROL]]
 ; CHECK-NEXT:    ]
 ; CHECK:       latch.prol:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT:    [[C2_PROL:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_PROL]], [[C1]]
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER_PROL]], label [[HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER_PROL]], label [[HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       header.prol.loopexit.unr-lcssa:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL]], [[LATCH_PROL]] ]
 ; CHECK-NEXT:    br label [[HEADER_PROL_LOOPEXIT]]
@@ -288,34 +394,34 @@ define void @test4(i16 %c3) {
 ; CHECK-NEXT:    br label [[EXITING:%.*]]
 ; CHECK:       exiting:
 ; CHECK-NEXT:    switch i16 [[C3]], label [[DEFAULT_LOOPEXIT_LOOPEXIT:%.*]] [
-; CHECK-NEXT:    i16 45, label [[OTHEREXIT_LOOPEXIT:%.*]]
-; CHECK-NEXT:    i16 95, label [[LATCH:%.*]]
+; CHECK-NEXT:      i16 45, label [[OTHEREXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT:      i16 95, label [[LATCH:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       latch:
 ; CHECK-NEXT:    br label [[EXITING_1:%.*]]
 ; CHECK:       exiting.1:
 ; CHECK-NEXT:    switch i16 [[C3]], label [[DEFAULT_LOOPEXIT_LOOPEXIT]] [
-; CHECK-NEXT:    i16 45, label [[OTHEREXIT_LOOPEXIT]]
-; CHECK-NEXT:    i16 95, label [[LATCH_1:%.*]]
+; CHECK-NEXT:      i16 45, label [[OTHEREXIT_LOOPEXIT]]
+; CHECK-NEXT:      i16 95, label [[LATCH_1:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       latch.1:
 ; CHECK-NEXT:    br label [[EXITING_2:%.*]]
 ; CHECK:       exiting.2:
 ; CHECK-NEXT:    switch i16 [[C3]], label [[DEFAULT_LOOPEXIT_LOOPEXIT]] [
-; CHECK-NEXT:    i16 45, label [[OTHEREXIT_LOOPEXIT]]
-; CHECK-NEXT:    i16 95, label [[LATCH_2:%.*]]
+; CHECK-NEXT:      i16 45, label [[OTHEREXIT_LOOPEXIT]]
+; CHECK-NEXT:      i16 95, label [[LATCH_2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       latch.2:
 ; CHECK-NEXT:    br label [[EXITING_3:%.*]]
 ; CHECK:       exiting.3:
 ; CHECK-NEXT:    switch i16 [[C3]], label [[DEFAULT_LOOPEXIT_LOOPEXIT]] [
-; CHECK-NEXT:    i16 45, label [[OTHEREXIT_LOOPEXIT]]
-; CHECK-NEXT:    i16 95, label [[LATCH_3]]
+; CHECK-NEXT:      i16 45, label [[OTHEREXIT_LOOPEXIT]]
+; CHECK-NEXT:      i16 95, label [[LATCH_3]]
 ; CHECK-NEXT:    ]
 ; CHECK:       latch.3:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[C2_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], [[C1]]
-; CHECK-NEXT:    br i1 [[C2_3]], label [[HEADER]], label [[LATCHEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C2_3]], label [[HEADER]], label [[LATCHEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       latchexit.unr-lcssa:
 ; CHECK-NEXT:    br label [[LATCHEXIT]]
 ; CHECK:       latchexit:
@@ -414,13 +520,13 @@ define void @test5() {
 ; CHECK-NEXT:    [[C2_3_PROL:%.*]] = call i1 @unknown(i32 0)
 ; CHECK-NEXT:    br i1 [[C2_3_PROL]], label [[INNERLATCH_3_PROL:%.*]], label [[EXITB_LOOPEXIT_LOOPEXIT2]]
 ; CHECK:       innerLatch.3.prol:
-; CHECK-NEXT:    br i1 false, label [[INNERH_PROL]], label [[OUTERLATCH_PROL]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 false, label [[INNERH_PROL]], label [[OUTERLATCH_PROL]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       outerLatch.prol:
 ; CHECK-NEXT:    [[TMP6_PROL]] = add i32 [[TMP4_PROL]], 1
 ; CHECK-NEXT:    [[TMP7_PROL:%.*]] = icmp sgt i32 [[TMP6_PROL]], 79
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i32 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i32 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[OUTERH_PROL]], label [[OUTERH_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[OUTERH_PROL]], label [[OUTERH_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       outerH.prol.loopexit.unr-lcssa:
 ; CHECK-NEXT:    [[TMP4_UNR_PH:%.*]] = phi i32 [ [[TMP6_PROL]], [[OUTERLATCH_PROL]] ]
 ; CHECK-NEXT:    br label [[OUTERH_PROL_LOOPEXIT]]
@@ -464,7 +570,7 @@ define void @test5() {
 ; CHECK-NEXT:    [[C2_3:%.*]] = call i1 @unknown(i32 0)
 ; CHECK-NEXT:    br i1 [[C2_3]], label [[INNERLATCH_3:%.*]], label [[EXITB_LOOPEXIT_LOOPEXIT_LOOPEXIT]]
 ; CHECK:       innerLatch.3:
-; CHECK-NEXT:    br i1 false, label [[INNERH]], label [[OUTERLATCH]], !llvm.loop [[LOOP6]]
+; CHECK-NEXT:    br i1 false, label [[INNERH]], label [[OUTERLATCH]], !llvm.loop [[LOOP9]]
 ; CHECK:       outerLatch:
 ; CHECK-NEXT:    br label [[INNERH_14:%.*]]
 ; CHECK:       innerH.14:
@@ -498,7 +604,7 @@ define void @test5() {
 ; CHECK-NEXT:    [[C2_3_1:%.*]] = call i1 @unknown(i32 0)
 ; CHECK-NEXT:    br i1 [[C2_3_1]], label [[INNERLATCH_3_1:%.*]], label [[EXITB_LOOPEXIT_LOOPEXIT_LOOPEXIT19]]
 ; CHECK:       innerLatch.3.1:
-; CHECK-NEXT:    br i1 false, label [[INNERH_14]], label [[OUTERLATCH_1]], !llvm.loop [[LOOP6]]
+; CHECK-NEXT:    br i1 false, label [[INNERH_14]], label [[OUTERLATCH_1]], !llvm.loop [[LOOP9]]
 ; CHECK:       outerLatch.1:
 ; CHECK-NEXT:    br label [[INNERH_29:%.*]]
 ; CHECK:       innerH.29:
@@ -532,7 +638,7 @@ define void @test5() {
 ; CHECK-NEXT:    [[C2_3_2:%.*]] = call i1 @unknown(i32 0)
 ; CHECK-NEXT:    br i1 [[C2_3_2]], label [[INNERLATCH_3_2:%.*]], label [[EXITB_LOOPEXIT_LOOPEXIT_LOOPEXIT21]]
 ; CHECK:       innerLatch.3.2:
-; CHECK-NEXT:    br i1 false, label [[INNERH_29]], label [[OUTERLATCH_2]], !llvm.loop [[LOOP6]]
+; CHECK-NEXT:    br i1 false, label [[INNERH_29]], label [[OUTERLATCH_2]], !llvm.loop [[LOOP9]]
 ; CHECK:       outerLatch.2:
 ; CHECK-NEXT:    br label [[INNERH_314:%.*]]
 ; CHECK:       innerH.314:
@@ -566,11 +672,11 @@ define void @test5() {
 ; CHECK-NEXT:    [[C2_3_3:%.*]] = call i1 @unknown(i32 0)
 ; CHECK-NEXT:    br i1 [[C2_3_3]], label [[INNERLATCH_3_3:%.*]], label [[EXITB_LOOPEXIT_LOOPEXIT_LOOPEXIT23]]
 ; CHECK:       innerLatch.3.3:
-; CHECK-NEXT:    br i1 false, label [[INNERH_314]], label [[OUTERLATCH_3]], !llvm.loop [[LOOP6]]
+; CHECK-NEXT:    br i1 false, label [[INNERH_314]], label [[OUTERLATCH_3]], !llvm.loop [[LOOP9]]
 ; CHECK:       outerLatch.3:
 ; CHECK-NEXT:    [[TMP6_3]] = add i32 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP7_3:%.*]] = icmp sgt i32 [[TMP6_3]], 79
-; CHECK-NEXT:    br i1 [[TMP7_3]], label [[OUTERLATCHEXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[OUTERH]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7_3]], label [[OUTERLATCHEXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[OUTERH]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       outerLatchExit.loopexit.unr-lcssa:
 ; CHECK-NEXT:    br label [[OUTERLATCHEXIT_LOOPEXIT]]
 ; CHECK:       outerLatchExit.loopexit:
@@ -676,7 +782,7 @@ define void @test6(i64 %start) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT_PROL]], 616
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER_PROL]], label [[HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[HEADER_PROL]], label [[HEADER_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       header.prol.loopexit.unr-lcssa:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL]], [[LATCH_PROL]] ]
 ; CHECK-NEXT:    br label [[HEADER_PROL_LOOPEXIT]]
@@ -709,7 +815,7 @@ define void @test6(i64 %start) {
 ; CHECK:       latch.3:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT_3]], 616
-; CHECK-NEXT:    br i1 [[TMP9]], label [[HEADER]], label [[LATCHEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[HEADER]], label [[LATCHEXIT_UNR_LCSSA:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       latchexit.unr-lcssa:
 ; CHECK-NEXT:    br label [[LATCHEXIT]]
 ; CHECK:       latchexit:
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-after-rotate.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-after-rotate.ll
new file mode 100644
index 0000000000000..50afa08a2d7c9
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-after-rotate.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=loop-unroll -unroll-runtime=true -unroll-runtime-other-exit-predictable=1 -S %s | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i64 %0, ptr %1) #0 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i64 [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[B1:%.*]] = icmp eq i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[B1]], label %[[AFTER:.*]], label %[[BODY_LR_PH:.*]]
+; CHECK:       [[BODY_LR_PH]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 0, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP2]], 7
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label %[[BODY_PROL_PREHEADER:.*]], label %[[BODY_PROL_LOOPEXIT:.*]]
+; CHECK:       [[BODY_PROL_PREHEADER]]:
+; CHECK-NEXT:    br label %[[BODY_PROL:.*]]
+; CHECK:       [[BODY_PROL]]:
+; CHECK-NEXT:    [[A2_PROL:%.*]] = phi i64 [ [[TMP0]], %[[BODY_PROL_PREHEADER]] ], [ [[A_PROL:%.*]], %[[HEADER_PROL:.*]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ 0, %[[BODY_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[HEADER_PROL]] ]
+; CHECK-NEXT:    [[C_PROL:%.*]] = add i64 [[A2_PROL]], 1
+; CHECK-NEXT:    [[D_PROL:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[E_PROL:%.*]] = icmp eq i32 [[D_PROL]], 0
+; CHECK-NEXT:    br i1 [[E_PROL]], label %[[END_LOOPEXIT3:.*]], label %[[HEADER_PROL]]
+; CHECK:       [[HEADER_PROL]]:
+; CHECK-NEXT:    [[A_PROL]] = phi i64 [ [[C_PROL]], %[[BODY_PROL]] ]
+; CHECK-NEXT:    [[B_PROL:%.*]] = icmp eq i64 [[A_PROL]], 0
+; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label %[[BODY_PROL]], label %[[BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[BODY_PROL_LOOPEXIT_UNR_LCSSA]]:
+; CHECK-NEXT:    [[A2_UNR_PH:%.*]] = phi i64 [ [[A_PROL]], %[[HEADER_PROL]] ]
+; CHECK-NEXT:    br label %[[BODY_PROL_LOOPEXIT]]
+; CHECK:       [[BODY_PROL_LOOPEXIT]]:
+; CHECK-NEXT:    [[A2_UNR:%.*]] = phi i64 [ [[TMP0]], %[[BODY_LR_PH]] ], [ [[A2_UNR_PH]], %[[BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], 7
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[HEADER_AFTER_CRIT_EDGE:.*]], label %[[BODY_LR_PH_NEW:.*]]
+; CHECK:       [[BODY_LR_PH_NEW]]:
+; CHECK-NEXT:    br label %[[BODY:.*]]
+; CHECK:       [[HEADER:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT:.*]], label %[[HEADER_1:.*]]
+; CHECK:       [[HEADER_1]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_2:.*]]
+; CHECK:       [[HEADER_2]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_3:.*]]
+; CHECK:       [[HEADER_3]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_4:.*]]
+; CHECK:       [[HEADER_4]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_5:.*]]
+; CHECK:       [[HEADER_5]]:
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_6:.*]]
+; CHECK:       [[HEADER_6]]:
+; CHECK-NEXT:    [[C_7:%.*]] = add i64 [[A2:%.*]], 8
+; CHECK-NEXT:    br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_7:.*]]
+; CHECK:       [[HEADER_7]]:
+; CHECK-NEXT:    [[B_7:%.*]] = icmp eq i64 [[C_7]], 0
+; CHECK-NEXT:    br i1 [[B_7]], label %[[HEADER_AFTER_CRIT_EDGE_UNR_LCSSA:.*]], label %[[BODY]]
+; CHECK:       [[BODY]]:
+; CHECK-NEXT:    [[A2]] = phi i64 [ [[A2_UNR]], %[[BODY_LR_PH_NEW]] ], [ [[C_7]], %[[HEADER_7]] ]
+; CHECK-NEXT:    [[D:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[E:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[E]], label %[[END_LOOPEXIT]], label %[[HEADER]]
+; CHECK:       [[END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[END:.*]]
+; CHECK:       [[END_LOOPEXIT3]]:
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[HEADER_AFTER_CRIT_EDGE_UNR_LCSSA]]:
+; CHECK-NEXT:    br label %[[HEADER_AFTER_CRIT_EDGE]]
+; CHECK:       [[HEADER_AFTER_CRIT_EDGE]]:
+; CHECK-NEXT:    br label %[[AFTER]]
+; CHECK:       [[AFTER]]:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %header
+
+header:
+  %a = phi i64 [ %0, %entry ], [ %c, %body ]
+  %b = icmp eq i64 %a, 0
+  br i1 %b, label %after, label %body
+
+body:
+  %c = add i64 %a, 1
+  %d = load i32, ptr %1, align 4
+  %e = icmp eq i32 %d, 0
+  br i1 %e, label %end, label %header
+
+end:
+  ret void
+
+after:
+  call void @foo()
+  ret void
+}
+
+declare void @foo()
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
\ No newline at end of file

>From cfb16199e100aa9b33eaca7ffecd28a6930f7340 Mon Sep 17 00:00:00 2001
From: Marek Sedlacek <msedlacek at azul.com>
Date: Tue, 15 Jul 2025 11:24:03 +0000
Subject: [PATCH 2/3] Added flag to enable loop rotation in
 UnrollRuntimeLoopRemainder

---
 .../llvm/Transforms/Utils/UnrollLoop.h        |  2 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      |  2 +-
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp |  4 +-
 .../Transforms/Utils/LoopUnrollRuntime.cpp    | 37 ++++++++++---------
 .../Transforms/Utils/UnrollLoopTest.cpp       |  3 +-
 5 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 4499b6f43ba19..d8bcd43cfe7e2 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -111,7 +111,7 @@ LLVM_ABI LoopReminderUnrollResult UnrollRuntimeLoopRemainder(
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    Loop **ResultLoop = nullptr);
+    bool AllowLoopRotation, Loop **ResultLoop = nullptr);
 
 LLVM_ABI LoopUnrollResult UnrollAndJamLoop(
     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 74662092a014f..3ae288229cc08 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -542,7 +542,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
         ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
         PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
-        RemainderLoop);
+        /*AllowLoopRotation*/ true, RemainderLoop);
     LatchBlock = L->getLoopLatch();
     LatchIsExiting = L->isLoopExiting(LatchBlock);
   }
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 43189ddea4603..53da8db9102a2 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -237,12 +237,12 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   // Are we eliminating the loop control altogether?
   bool CompletelyUnroll = (Count == TripCount);
 
-  // We use the runtime remainder in cases where we don't know trip multiple
   if (TripMultiple % Count != 0) {
     auto UnrollReminderResult = UnrollRuntimeLoopRemainder(
         L, Count, /*AllowExpensiveTripCount*/ false,
         /*UseEpilogRemainder*/ true, UnrollRemainder, /*ForgetAllSCEV*/ false,
-        LI, SE, DT, AC, TTI, true, SCEVCheapExpansionBudget, EpilogueLoop);
+        LI, SE, DT, AC, TTI, /*PreserveLCSSA*/ true, SCEVCheapExpansionBudget,
+        /*AllowLoopRotation*/ false, EpilogueLoop);
     if (UnrollReminderResult != LoopReminderUnrollResult::Unrolled) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 142f97b5c3a1a..ac98e5155d820 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -581,7 +581,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    Loop **ResultLoop) {
+    bool AllowLoopRotation, Loop **ResultLoop) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -590,22 +590,25 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
   LoopReminderUnrollResult Result = LoopReminderUnrollResult::Unmodified;
 
   // Rotate loop if it makes the exit count from the latch computable.
-  BasicBlock *OrigHeader = L->getHeader();
-  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-  if (BI && !BI->isUnconditional() &&
-      isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
-      !isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
-    LLVM_DEBUG(
-        dbgs() << "  Rotating loop to make the exit count computable.\n");
-    SimplifyQuery SQ{OrigHeader->getDataLayout()};
-    SQ.TLI = nullptr;
-    SQ.DT = DT;
-    SQ.AC = AC;
-    if (llvm::LoopRotation(L, LI, TTI, AC, DT, SE, nullptr /*MemorySSAUpdater*/,
-                           SQ, false /*RotationOnly*/, 16 /*Threshold*/,
-                           false /*IsUtilMode*/, false /*PrepareForLTO*/,
-                           [](Loop *, ScalarEvolution *) { return true; }))
-      Result = LoopReminderUnrollResult::Rotated;
+  if (AllowLoopRotation) {
+    BasicBlock *OrigHeader = L->getHeader();
+    BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+    if (BI && !BI->isUnconditional() &&
+        isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
+        !isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
+      LLVM_DEBUG(
+          dbgs() << "  Rotating loop to make the exit count computable.\n");
+      SimplifyQuery SQ{OrigHeader->getDataLayout()};
+      SQ.TLI = nullptr;
+      SQ.DT = DT;
+      SQ.AC = AC;
+      if (llvm::LoopRotation(L, LI, TTI, AC, DT, SE,
+                             /*MemorySSAUpdater*/ nullptr, SQ,
+                             /*RotationOnly*/ false, /*Threshold*/ 16,
+                             /*IsUtilMode*/ false, /*PrepareForLTO*/ false,
+                             [](Loop *, ScalarEvolution *) { return true; }))
+        Result = LoopReminderUnrollResult::Rotated;
+    }
   }
 
   // Make sure the loop is in canonical form.
diff --git a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
index eec10110e6af4..76062609bf200 100644
--- a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
+++ b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
@@ -73,6 +73,7 @@ while.end:                                        ; preds = %while.cond
 
   bool ret =
       UnrollRuntimeLoopRemainder(L, 4, true, false, false, false, &LI, &SE, &DT,
-                                 &AC, /*TTI=*/nullptr, PreserveLCSSA, 4, false);
+                                 &AC, /*TTI=*/nullptr, PreserveLCSSA, 4, false,
+                                 false);
   EXPECT_FALSE(ret);
 }

>From b2e1fe6271cc614bad768f75a5956e11cdda7272 Mon Sep 17 00:00:00 2001
From: Marek Sedlacek <msedlacek at azul.com>
Date: Tue, 15 Jul 2025 13:07:37 +0000
Subject: [PATCH 3/3] Try remainder unrolling; when checks fail, rotate loop and
 retry

---
 .../llvm/Transforms/Utils/UnrollLoop.h        | 16 +-----
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 50 +++++++++++++----
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp | 16 +++---
 .../Transforms/Utils/LoopUnrollRuntime.cpp    | 53 +++++--------------
 4 files changed, 63 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index d8bcd43cfe7e2..395e78f3c1f8d 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -72,18 +72,6 @@ enum class LoopUnrollResult {
   FullyUnrolled
 };
 
-/// Represents the result of a \c UnrollRuntimeLoopRemainder invocation.
-enum class LoopReminderUnrollResult {
-  /// The loop reminder was not modified.
-  Unmodified,
-
-  /// The loop was rotated, but not unrolled.
-  Rotated,
-
-  /// The loop reminder was unrolled.
-  Unrolled
-};
-
 struct UnrollLoopOptions {
   unsigned Count;
   bool Force;
@@ -105,13 +93,13 @@ LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO,
                                      Loop **RemainderLoop = nullptr,
                                      AAResults *AA = nullptr);
 
-LLVM_ABI LoopReminderUnrollResult UnrollRuntimeLoopRemainder(
+LLVM_ABI bool UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    bool AllowLoopRotation, Loop **ResultLoop = nullptr);
+    Loop **ResultLoop = nullptr);
 
 LLVM_ABI LoopUnrollResult UnrollAndJamLoop(
     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 3ae288229cc08..a3c6d9cff24f6 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -58,6 +58,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -535,20 +536,51 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
                                               : isEpilogProfitable(L);
 
-  LoopReminderUnrollResult UnrollReminderResult =
-      LoopReminderUnrollResult::Unmodified;
+  bool LoopRotated = false;
+  bool ReminderUnrolled = false;
   if (ULO.Runtime) {
-    UnrollReminderResult = UnrollRuntimeLoopRemainder(
+    // First try runtime unrolling on the loop as-is, without rotating it.
+    ReminderUnrolled = UnrollRuntimeLoopRemainder(
         L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
         ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
         PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
-        /*AllowLoopRotation*/ true, RemainderLoop);
+        RemainderLoop);
+
+    // If unroll is not possible, then try with loop rotation.
+    if (!ReminderUnrolled) {
+      BasicBlock *OrigHeader = L->getHeader();
+      BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+      if (BI && !BI->isUnconditional() &&
+          isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
+          !isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
+        LLVM_DEBUG(
+            dbgs() << "  Rotating loop to make the exit count computable.\n");
+        SimplifyQuery SQ{OrigHeader->getDataLayout()};
+        SQ.TLI = nullptr;
+        SQ.DT = DT;
+        SQ.AC = AC;
+        LoopRotated =
+            llvm::LoopRotation(L, LI, TTI, AC, DT, SE,
+                               /*MemorySSAUpdater*/ nullptr, SQ,
+                               /*RotationOnly*/ false, /*Threshold*/ 16,
+                               /*IsUtilMode*/ false, /*PrepareForLTO*/ false,
+                               [](Loop *, ScalarEvolution *) { return true; });
+      }
+      if (LoopRotated) {
+        // Loop was rotated, try unrolling.
+        ReminderUnrolled = UnrollRuntimeLoopRemainder(
+            L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+            ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+            PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+            RemainderLoop);
+      }
+    }
+    // Latch block needs to be updated.
     LatchBlock = L->getLoopLatch();
     LatchIsExiting = L->isLoopExiting(LatchBlock);
   }
 
-  if (ULO.Runtime &&
-      UnrollReminderResult != LoopReminderUnrollResult::Unrolled) {
+  if (ULO.Runtime && !ReminderUnrolled) {
     if (ULO.Force)
       ULO.Runtime = false;
     else {
@@ -556,10 +588,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                            "generated when assuming runtime trip count\n");
       // Loop might have been rotated inside of UnrollRuntimeLoopRemainder and
       // this needs to be propagated.
-      return UnrollReminderResult == LoopReminderUnrollResult::Rotated
-                 ? LoopUnrollResult::Modified
-                 : LoopUnrollResult::Unmodified;
-      ;
+      return LoopRotated ? LoopUnrollResult::Modified
+                         : LoopUnrollResult::Unmodified;
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 53da8db9102a2..ca90bb65f5708 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -237,18 +237,16 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   // Are we eliminating the loop control altogether?
   bool CompletelyUnroll = (Count == TripCount);
 
+  // We use the runtime remainder in cases where we don't know trip multiple
   if (TripMultiple % Count != 0) {
-    auto UnrollReminderResult = UnrollRuntimeLoopRemainder(
-        L, Count, /*AllowExpensiveTripCount*/ false,
-        /*UseEpilogRemainder*/ true, UnrollRemainder, /*ForgetAllSCEV*/ false,
-        LI, SE, DT, AC, TTI, /*PreserveLCSSA*/ true, SCEVCheapExpansionBudget,
-        /*AllowLoopRotation*/ false, EpilogueLoop);
-    if (UnrollReminderResult != LoopReminderUnrollResult::Unrolled) {
+    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+                                    /*UseEpilogRemainder*/ true,
+                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
+                                    LI, SE, DT, AC, TTI, true,
+                                    SCEVCheapExpansionBudget, EpilogueLoop)) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
-      return UnrollReminderResult == LoopReminderUnrollResult::Rotated
-                 ? LoopUnrollResult::Modified
-                 : LoopUnrollResult::Unmodified;
+      return LoopUnrollResult::Unmodified;
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index ac98e5155d820..bf882d7406853 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -37,7 +37,6 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopRotationUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -575,46 +574,22 @@ static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,
 ///        if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
 /// EpilExit:
 
-LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
+bool llvm::UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    bool AllowLoopRotation, Loop **ResultLoop) {
+    Loop **ResultLoop) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
                                 : dbgs() << "Using prolog remainder.\n");
 
-  LoopReminderUnrollResult Result = LoopReminderUnrollResult::Unmodified;
-
-  // Rotate loop if it makes the exit count from the latch computable.
-  if (AllowLoopRotation) {
-    BasicBlock *OrigHeader = L->getHeader();
-    BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-    if (BI && !BI->isUnconditional() &&
-        isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
-        !isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
-      LLVM_DEBUG(
-          dbgs() << "  Rotating loop to make the exit count computable.\n");
-      SimplifyQuery SQ{OrigHeader->getDataLayout()};
-      SQ.TLI = nullptr;
-      SQ.DT = DT;
-      SQ.AC = AC;
-      if (llvm::LoopRotation(L, LI, TTI, AC, DT, SE,
-                             /*MemorySSAUpdater*/ nullptr, SQ,
-                             /*RotationOnly*/ false, /*Threshold*/ 16,
-                             /*IsUtilMode*/ false, /*PrepareForLTO*/ false,
-                             [](Loop *, ScalarEvolution *) { return true; }))
-        Result = LoopReminderUnrollResult::Rotated;
-    }
-  }
-
   // Make sure the loop is in canonical form.
   if (!L->isLoopSimplifyForm()) {
     LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
-    return Result;
+    return false;
   }
 
   // Guaranteed by LoopSimplifyForm.
@@ -628,7 +603,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "Loop latch not terminated by a conditional branch.\n");
-    return Result;
+    return false;
   }
 
   unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
@@ -640,7 +615,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "One of the loop latch successors must be the exit block.\n");
-    return Result;
+    return false;
   }
 
   // These are exit blocks other than the target of the latch exiting block.
@@ -652,12 +627,12 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     // We rely on LCSSA form being preserved when the exit blocks are transformed.
     // (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)
     if (!PreserveLCSSA)
-      return Result;
+      return false;
 
     // Priority goes to UnrollRuntimeMultiExit if it's supplied.
     if (UnrollRuntimeMultiExit.getNumOccurrences()) {
       if (!UnrollRuntimeMultiExit)
-        return Result;
+        return false;
     } else {
       // Otherwise perform multi-exit unrolling, if either the target indicates
       // it is profitable or the general profitability heuristics apply.
@@ -666,14 +641,14 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
                                                    UseEpilogRemainder)) {
         LLVM_DEBUG(dbgs() << "Multiple exit/exiting blocks in loop and "
                              "multi-exit unrolling not enabled!\n");
-        return Result;
+        return false;
       }
     }
   }
   // Use Scalar Evolution to compute the trip count. This allows more loops to
   // be unrolled than relying on induction var simplification.
   if (!SE)
-    return Result;
+    return false;
 
   // Only unroll loops with a computable trip count.
   // We calculate the backedge count by using getExitCount on the Latch block,
@@ -683,7 +658,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
   const SCEV *BECountSC = SE->getExitCount(L, Latch);
   if (isa<SCEVCouldNotCompute>(BECountSC)) {
     LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
-    return Result;
+    return false;
   }
 
   unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
@@ -694,7 +669,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
       SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
   if (isa<SCEVCouldNotCompute>(TripCountSC)) {
     LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
-    return Result;
+    return false;
   }
 
   BasicBlock *PreHeader = L->getLoopPreheader();
@@ -705,7 +680,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
       Expander.isHighCostExpansion(TripCountSC, L, SCEVExpansionBudget, TTI,
                                    PreHeaderBR)) {
     LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
-    return Result;
+    return false;
   }
 
   // This constraint lets us deal with an overflowing trip count easily; see the
@@ -714,7 +689,7 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
     LLVM_DEBUG(
         dbgs()
         << "Count failed constraint on overflow trip count calculation.\n");
-    return Result;
+    return false;
   }
 
   // Loop structure is the following:
@@ -1060,5 +1035,5 @@ LoopReminderUnrollResult llvm::UnrollRuntimeLoopRemainder(
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
     *ResultLoop = remainderLoop;
   NumRuntimeUnrolled++;
-  return LoopReminderUnrollResult::Unrolled;
+  return true;
 }



More information about the llvm-commits mailing list