[llvm] [LoopPeel] Implement initial peeling off the last loop iteration. (PR #139551)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu May 15 04:58:23 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/139551

>From 638356ecd23d6de9a1d836b6cd3d0b5e3f7b4640 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 6 May 2025 22:06:38 +0100
Subject: [PATCH 1/5] [LoopPeel] Implement initial peeling off the last loop
 iteration.

Generalize countToEliminateCompares to also consider peeling off the
last iteration if it eliminates a compare. At the moment, codegen for
peeling off the last iteration is quite restrictive and callers have to
make sure that the exit condition can be adjusted when peeling and that
the loop executes at least 2 iterations. Both will be relaxed in
follow-ups.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   3 +
 llvm/include/llvm/Transforms/Utils/LoopPeel.h |  11 +-
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       |   3 +-
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   3 +-
 llvm/lib/Transforms/Utils/LoopPeel.cpp        | 364 ++++++++++++------
 .../LoopUnroll/peel-last-iteration.ll         |  93 +++--
 6 files changed, 314 insertions(+), 163 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3f639138d8b75..1aed98e8f50db 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -680,6 +680,9 @@ class TargetTransformInfo {
     /// If the value is true the peeling cost model can decide to peel only
     /// some iterations and in this case it will set this to false.
     bool PeelProfiledIterations;
+
+    /// Peel off the last PeelCount loop iterations.
+    bool PeelLast;
   };
 
   /// Get target-customized preferences for the generic loop peeling
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 0b78700ca71bb..f7babaf036768 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -22,10 +22,13 @@ namespace llvm {
 bool canPeel(const Loop *L);
 
 /// VMap is the value-map that maps instructions from the original loop to
-/// instructions in the last peeled-off iteration.
-bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
-              DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA,
-              ValueToValueMapTy &VMap);
+/// instructions in the last peeled-off iteration. If \p PeelLast is true, peel
+/// off the last \p PeelCount iterations from \p L. In that case, the caller has
+/// to make sure that the exit condition can be adjusted when peeling and that
+/// the loop executes at least 2 iterations.
+bool peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
+              ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
+              bool PreserveLCSSA, ValueToValueMapTy &VMap);
 
 TargetTransformInfo::PeelingPreferences
 gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 5bba3016ba4a1..d6bd92d520e28 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -790,7 +790,8 @@ struct LoopFuser {
                       << " iterations of the first loop. \n");
 
     ValueToValueMapTy VMap;
-    FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap);
+    FC0.Peeled =
+        peelLoop(FC0.L, PeelCount, false, &LI, &SE, DT, &AC, true, VMap);
     if (FC0.Peeled) {
       LLVM_DEBUG(dbgs() << "Done Peeling\n");
 
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index d84b74dd0eecc..0b9fee5727c6f 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1314,7 +1314,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
     });
 
     ValueToValueMapTy VMap;
-    if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
+    if (peelLoop(L, PP.PeelCount, PP.PeelLast, LI, &SE, DT, &AC, PreserveLCSSA,
+                 VMap)) {
       simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
       // If the loop was peeled, we already "used up" the profile information
       // we had, so we don't want to unroll or peel again.
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f6ace9c4e5d2f..f01c3948b87ed 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -49,6 +49,7 @@ using namespace llvm::PatternMatch;
 #define DEBUG_TYPE "loop-peel"
 
 STATISTIC(NumPeeled, "Number of loops peeled");
+STATISTIC(NumPeeledEnd, "Number of loops peeled from end");
 
 static cl::opt<unsigned> UnrollPeelCount(
     "unroll-peel-count", cl::Hidden,
@@ -325,19 +326,66 @@ static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
   return 0;
 }
 
+/// Returns true if the last iteration can be peeled off and the condition (Pred
+/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition
+/// is known at the second-to-last. This function also has to make sure the loop
+/// exit condition can be adjusted when peeling and that the loop executes at
+/// least 2 iterations.
+static bool canPeelLastIteration(Loop &L, const SCEVAddRecExpr *LeftAR,
+                                 const SCEV *RightSCEV, ScalarEvolution &SE) {
+  const SCEV *BTC = SE.getBackedgeTakenCount(&L);
+  Value *Inc;
+  CmpPredicate Pred;
+  BasicBlock *Succ1;
+  BasicBlock *Succ2;
+  // The loop must execute at least 2 iterations to guarantee that the peeled
+  // iteration executes.
+  // TODO: Add checks during codegen.
+  if (isa<SCEVCouldNotCompute>(BTC) ||
+      !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getOne(BTC->getType())))
+    return false;
+
+  // Check if the exit condition of the loop can be adjusted by the peeling
+  // codegen. For now, it must
+  // * exit via the latch,
+  // * the exit condition must be a NE/EQ compare of an induction with step
+  // of 1.
+  BasicBlock *Latch = L.getLoopLatch();
+  if (Latch != L.getExitingBlock() ||
+      !match(Latch->getTerminator(),
+             m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1),
+                  m_BasicBlock(Succ2))) ||
+      !((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
+        (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) ||
+      !isa<SCEVAddRecExpr>(SE.getSCEV(Inc)) ||
+      !cast<SCEVAddRecExpr>(SE.getSCEV(Inc))->getStepRecurrence(SE)->isOne())
+    return false;
+
+  const SCEV *ValAtLastIter =
+      SE.applyLoopGuards(LeftAR->evaluateAtIteration(BTC, SE), &L);
+  const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
+      SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE);
+
+  return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), ValAtLastIter,
+                             SE.applyLoopGuards(RightSCEV, &L)) &&
+         SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV);
+}
+
 // Return the number of iterations to peel off that make conditions in the
-// body true/false. For example, if we peel 2 iterations off the loop below,
-// the condition i < 2 can be evaluated at compile time.
+// body true/false. Positive return values indicate the iterations to peel off
+// from the front and negative return values indicate the number of iterations
+// from the back after removing the sign. For example, if we peel 2 iterations
+// off the loop below, the condition i < 2 can be evaluated at compile time.
 //  for (i = 0; i < n; i++)
 //    if (i < 2)
 //      ..
 //    else
 //      ..
 //   }
-static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
-                                         ScalarEvolution &SE) {
+static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
+                                    ScalarEvolution &SE) {
   assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
-  unsigned DesiredPeelCount = 0;
+  int DesiredPeelCount = 0;
 
   // Do not peel the entire loop.
   const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L);
@@ -349,9 +397,9 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
   // return true if inversed condition become known before reaching the
   // MaxPeelCount limit.
   auto PeelWhilePredicateIsKnown =
-      [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
+      [&](int &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
           const SCEV *Step, ICmpInst::Predicate Pred) {
-        while (PeelCount < MaxPeelCount &&
+        while (unsigned(std::abs(PeelCount)) < MaxPeelCount &&
                SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
           IterVal = SE.getAddExpr(IterVal, Step);
           ++PeelCount;
@@ -408,7 +456,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
 
     // Check if extending the current DesiredPeelCount lets us evaluate Pred
     // or !Pred in the loop body statically.
-    unsigned NewPeelCount = DesiredPeelCount;
+    int NewPeelCount = DesiredPeelCount;
 
     const SCEV *IterVal = LeftAR->evaluateAtIteration(
         SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
@@ -421,8 +469,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
 
     const SCEV *Step = LeftAR->getStepRecurrence(SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
-                                   Pred))
+                                   Pred)) {
+      if (canPeelLastIteration(L, LeftAR, RightSCEV, SE))
+        DesiredPeelCount = -1;
       return;
+    }
 
     // However, for equality comparisons, that isn't always sufficient to
     // eliminate the comparsion in loop body, we may need to peel one more
@@ -433,7 +484,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
                              RightSCEV) &&
         !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
         SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
-      if (NewPeelCount >= MaxPeelCount)
+      if (unsigned(std::abs(NewPeelCount)) >= MaxPeelCount)
         return; // Need to peel one more iteration, but can't. Give up.
       ++NewPeelCount; // Great!
     }
@@ -472,7 +523,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     // Check that AddRec is not wrapping.
     if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
       return;
-    unsigned NewPeelCount = DesiredPeelCount;
+    int NewPeelCount = DesiredPeelCount;
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), NewPeelCount), SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
@@ -593,8 +644,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
       DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
   }
 
-  DesiredPeelCount = std::max(DesiredPeelCount,
-                              countToEliminateCompares(*L, MaxPeelCount, SE));
+  int CountToEliminateCmps = countToEliminateCompares(*L, MaxPeelCount, SE);
+  DesiredPeelCount =
+      std::max(DesiredPeelCount, unsigned(std::abs(CountToEliminateCmps)));
 
   if (DesiredPeelCount == 0)
     DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC);
@@ -609,6 +661,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                         << " some Phis into invariants.\n");
       PP.PeelCount = DesiredPeelCount;
       PP.PeelProfiledIterations = false;
+      PP.PeelLast =
+          DesiredPeelCount == unsigned(std::abs(CountToEliminateCmps)) &&
+          CountToEliminateCmps < 0;
       return;
     }
   }
@@ -733,6 +788,7 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
 /// InsertBot.
 /// \param IterNumber The serial number of the iteration currently being
 /// peeled off.
+/// \param PeelLast Peel off the last iterations from \p L.
 /// \param ExitEdges The exit edges of the original loop.
 /// \param[out] NewBlocks A list of the blocks in the newly created clone
 /// \param[out] VMap The value map between the loop and the new clone.
@@ -740,7 +796,8 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
 /// \param LVMap A value-map that maps instructions from the original loop to
 /// instructions in the last peeled-off iteration.
 static void cloneLoopBlocks(
-    Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot,
+    Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop,
+    BasicBlock *InsertBot,
     SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
     SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
     ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
@@ -804,16 +861,26 @@ static void cloneLoopBlocks(
 
   // Similarly, for the latch:
   // The original exiting edge is still hooked up to the loop exit.
-  // The backedge now goes to the "bottom", which is either the loop's real
-  // header (for the last peeled iteration) or the copied header of the next
-  // iteration (for every other iteration)
   BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
-  auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());
-  for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx)
-    if (LatchTerm->getSuccessor(idx) == Header) {
-      LatchTerm->setSuccessor(idx, InsertBot);
-      break;
+  if (PeelLast) {
+    // This is the last iteration and we definitely will go to the exit. Just
+    // set both successors to InsertBot and let the branch be simplified later.
+    assert(IterNumber == 0 && "Only peeling a single iteration implemented.");
+    auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator());
+    LatchTerm->setSuccessor(0, InsertBot);
+    LatchTerm->setSuccessor(1, InsertBot);
+  } else {
+    auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());
+    // The backedge now goes to the "bottom", which is either the loop's real
+    // header (for the last peeled iteration) or the copied header of the next
+    // iteration (for every other iteration)
+    for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx) {
+      if (LatchTerm->getSuccessor(idx) == Header) {
+        LatchTerm->setSuccessor(idx, InsertBot);
+        break;
+      }
     }
+  }
   if (DT)
     DT->changeImmediateDominator(InsertBot, NewLatch);
 
@@ -821,23 +888,33 @@ static void cloneLoopBlocks(
   // that pick an incoming value from either the preheader, or the previous
   // loop iteration. Since this copy is no longer part of the loop, we
   // resolve this statically:
-  // For the first iteration, we use the value from the preheader directly.
-  // For any other iteration, we replace the phi with the value generated by
-  // the immediately preceding clone of the loop body (which represents
-  // the previous iteration).
-  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
-    PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
-    if (IterNumber == 0) {
-      VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
-    } else {
-      Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
-      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
-      if (LatchInst && L->contains(LatchInst))
-        VMap[&*I] = LVMap[LatchInst];
-      else
-        VMap[&*I] = LatchVal;
+  if (PeelLast) {
+    // For the last iteration, we use the value from the latch of the original
+    // loop directly.
+    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+      PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+      VMap[&*I] = NewPHI->getIncomingValueForBlock(Latch);
+      NewPHI->eraseFromParent();
+    }
+  } else {
+    // For the first iteration, we use the value from the preheader directly.
+    // For any other iteration, we replace the phi with the value generated by
+    // the immediately preceding clone of the loop body (which represents
+    // the previous iteration).
+    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+      PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+      if (IterNumber == 0) {
+        VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
+      } else {
+        Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
+        Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
+        if (LatchInst && L->contains(LatchInst))
+          VMap[&*I] = LVMap[LatchInst];
+        else
+          VMap[&*I] = LatchVal;
+      }
+      NewPHI->eraseFromParent();
     }
-    NewPHI->eraseFromParent();
   }
 
   // Fix up the outgoing values - we need to add a value for the iteration
@@ -905,11 +982,13 @@ llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
 /// this provides a benefit, since the peeled off iterations, which account
 /// for the bulk of dynamic execution, can be further simplified by scalar
 /// optimizations.
-bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
+bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
                     ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
                     bool PreserveLCSSA, ValueToValueMapTy &LVMap) {
   assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
   assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
+  assert((!PeelLast || PeelCount == 1) &&
+         "can only peel off a single iteration from the end for now");
 
   LoopBlocksDFS LoopBlocks(L);
   LoopBlocks.perform(LI);
@@ -944,60 +1023,99 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
 
   Function *F = Header->getParent();
 
-  // Set up all the necessary basic blocks. It is convenient to split the
-  // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
-  // body, and a new preheader for the "real" loop.
-
-  // Peeling the first iteration transforms.
-  //
-  // PreHeader:
-  // ...
-  // Header:
-  //   LoopBody
-  //   If (cond) goto Header
-  // Exit:
-  //
-  // into
-  //
-  // InsertTop:
-  //   LoopBody
-  //   If (!cond) goto Exit
-  // InsertBot:
-  // NewPreHeader:
-  // ...
-  // Header:
-  //  LoopBody
-  //  If (cond) goto Header
-  // Exit:
-  //
-  // Each following iteration will split the current bottom anchor in two,
-  // and put the new copy of the loop body between these two blocks. That is,
-  // after peeling another iteration from the example above, we'll split
-  // InsertBot, and get:
-  //
-  // InsertTop:
-  //   LoopBody
-  //   If (!cond) goto Exit
-  // InsertBot:
-  //   LoopBody
-  //   If (!cond) goto Exit
-  // InsertBot.next:
-  // NewPreHeader:
-  // ...
-  // Header:
-  //  LoopBody
-  //  If (cond) goto Header
-  // Exit:
-
-  BasicBlock *InsertTop = SplitEdge(PreHeader, Header, &DT, LI);
-  BasicBlock *InsertBot =
-      SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
-  BasicBlock *NewPreHeader =
-      SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
-
-  InsertTop->setName(Header->getName() + ".peel.begin");
-  InsertBot->setName(Header->getName() + ".peel.next");
-  NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
+  // Set up all the necessary basic blocks.
+  BasicBlock *InsertTop;
+  BasicBlock *InsertBot;
+  BasicBlock *NewPreHeader;
+  DenseMap<Instruction *, Value *> ExitValues;
+  if (PeelLast) {
+    // It is convenient to split the single exit block from the latch
+    // into 3 parts - two blocks to anchor the peeled copy of the loop body,
+    // and a new final exit block.
+
+    // Peeling the last iteration transforms.
+    //
+    // PreHeader:
+    // ...
+    // Header:
+    //   LoopBody
+    //   If (cond) goto Header
+    // Exit:
+    //
+    // into
+    //
+    // Header:
+    //  LoopBody
+    //  If (cond) goto Header
+    // InsertTop:
+    //   LoopBody
+    //   If (!cond) goto InsertBot
+    // InsertBot:
+    // Exit:
+    // ...
+    BasicBlock *Exit = L->getExitBlock();
+    for (PHINode &P : Exit->phis())
+      ExitValues[&P] = P.getIncomingValueForBlock(Latch);
+
+    InsertTop = SplitEdge(Latch, Exit, &DT, LI);
+    InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
+
+    InsertTop->setName(Exit->getName() + ".peel.begin");
+    InsertBot->setName(Exit->getName() + ".peel.next");
+  } else {
+    // It is convenient to split the preheader into 3 parts - two blocks to
+    // anchor the peeled copy of the loop body, and a new preheader for the
+    // "real" loop.
+
+    // Peeling the first iteration transforms.
+    //
+    // PreHeader:
+    // ...
+    // Header:
+    //   LoopBody
+    //   If (cond) goto Header
+    // Exit:
+    //
+    // into
+    //
+    // InsertTop:
+    //   LoopBody
+    //   If (!cond) goto Exit
+    // InsertBot:
+    // NewPreHeader:
+    // ...
+    // Header:
+    //  LoopBody
+    //  If (cond) goto Header
+    // Exit:
+    //
+    // Each following iteration will split the current bottom anchor in two,
+    // and put the new copy of the loop body between these two blocks. That
+    // is, after peeling another iteration from the example above, we'll
+    // split InsertBot, and get:
+    //
+    // InsertTop:
+    //   LoopBody
+    //   If (!cond) goto Exit
+    // InsertBot:
+    //   LoopBody
+    //   If (!cond) goto Exit
+    // InsertBot.next:
+    // NewPreHeader:
+    // ...
+    // Header:
+    //  LoopBody
+    //  If (cond) goto Header
+    // Exit:
+    //
+    InsertTop = SplitEdge(PreHeader, Header, &DT, LI);
+    InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
+    NewPreHeader = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);
+
+    InsertTop->setName(Header->getName() + ".peel.begin");
+    InsertBot->setName(Header->getName() + ".peel.next");
+    NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
+  }
 
   Instruction *LatchTerm =
       cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());
@@ -1013,23 +1131,40 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
 
   // For each peeled-off iteration, make a copy of the loop.
+  ValueToValueMapTy VMap;
   for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
     SmallVector<BasicBlock *, 8> NewBlocks;
-    ValueToValueMapTy VMap;
 
-    cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
-                    LoopBlocks, VMap, LVMap, &DT, LI,
+    cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot, ExitEdges,
+                    NewBlocks, LoopBlocks, VMap, LVMap, &DT, LI,
                     LoopLocalNoAliasDeclScopes, *SE);
 
     // Remap to use values from the current iteration instead of the
     // previous one.
     remapInstructionsInBlocks(NewBlocks, VMap);
 
-    // Update IDoms of the blocks reachable through exits.
-    if (Iter == 0)
-      for (auto BBIDom : NonLoopBlocksIDom)
-        DT.changeImmediateDominator(BBIDom.first,
-                                     cast<BasicBlock>(LVMap[BBIDom.second]));
+    if (Iter == 0) {
+      if (PeelLast) {
+        // Adjust the exit condition so the loop exits one iteration early.
+        // For now we simply subtract one from the second operand of the
+        // exit condition. This relies on the peel count computation to
+        // check that this is actually legal. In particular, it ensures that
+        // the first operand of the compare is an AddRec with step 1 and we
+        // execute more than one iteration.
+        auto *Cmp =
+            cast<ICmpInst>(L->getLoopLatch()->getTerminator()->getOperand(0));
+        IRBuilder B(Cmp);
+        Cmp->setOperand(
+            1, B.CreateSub(Cmp->getOperand(1),
+                           ConstantInt::get(Cmp->getOperand(1)->getType(), 1)));
+      } else {
+        // Update IDoms of the blocks reachable through exits.
+        for (auto BBIDom : NonLoopBlocksIDom)
+          DT.changeImmediateDominator(BBIDom.first,
+                                      cast<BasicBlock>(LVMap[BBIDom.second]));
+      }
+    }
+
 #ifdef EXPENSIVE_CHECKS
     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
 #endif
@@ -1052,16 +1187,24 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
               F->end());
   }
 
-  // Now adjust the phi nodes in the loop header to get their initial values
-  // from the last peeled-off iteration instead of the preheader.
-  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PHI = cast<PHINode>(I);
-    Value *NewVal = PHI->getIncomingValueForBlock(Latch);
-    Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
-    if (LatchInst && L->contains(LatchInst))
-      NewVal = LVMap[LatchInst];
+  if (PeelLast) {
+    // Now adjust users of the original exit values by replacing them with the
+    // exit value from the peeled iteration.
+    for (const auto &[P, E] : ExitValues)
+      P->replaceAllUsesWith(VMap.lookup(E));
+    formLCSSA(*L, DT, LI, SE);
+  } else {
+    // Now adjust the phi nodes in the loop header to get their initial values
+    // from the last peeled-off iteration instead of the preheader.
+    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = cast<PHINode>(I);
+      Value *NewVal = PHI->getIncomingValueForBlock(Latch);
+      Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
+      if (LatchInst && L->contains(LatchInst))
+        NewVal = LVMap[LatchInst];
 
-    PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
+      PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
+    }
   }
 
   for (const auto &[Term, Info] : Weights) {
@@ -1090,6 +1233,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA);
 
   NumPeeled++;
+  NumPeeledEnd += PeelLast;
 
   return true;
 }
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
index 2d024bd83e5ce..6afdcd39c3afe 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
@@ -6,16 +6,28 @@ define i64 @peel_single_block_loop_iv_step_1() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i64 [[IV_NEXT1]], 63
+; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP18_NOT]], i32 10, i32 20
 ; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
+; CHECK-NEXT:    ret i64 [[IV]]
 ;
 entry:
   br label %loop
@@ -140,22 +152,6 @@ define i64 @peel_single_block_loop_iv_step_1_nested_loop() {
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 63
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20
-; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[OUTER_LATCH]]
-; CHECK:       [[OUTER_LATCH]]:
-; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
-; CHECK-NEXT:    call void @foo(i32 1)
-; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1
-; CHECK-NEXT:    [[OUTER_EC:%.*]] = icmp ne i64 [[OUTER_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[OUTER_EC]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[IV_LCSSA_LCSSA:%.*]] = phi i64 [ [[IV_LCSSA]], %[[OUTER_LATCH]] ]
-; CHECK-NEXT:    ret i64 [[IV_LCSSA_LCSSA]]
 ;
 entry:
   br label %outer.header
@@ -188,22 +184,6 @@ define i64 @peel_multi_block_loop_iv_step_1() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 63
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20
-; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LATCH]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    br label %[[LATCH]]
-; CHECK:       [[LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LATCH]] ]
-; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
 ;
 entry:
   br label %loop
@@ -443,25 +423,45 @@ define i32 @peel_loop_with_branch_and_phi_uses(ptr %x, i1 %c) {
 ; CHECK:       [[LOOP_HEADER_PREHEADER]]:
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[IV]], 99
-; CHECK-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[RED1:%.*]] = phi i32 [ [[ADD1:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br i1 false, label %[[IF_THEN:.*]], label %[[LOOP_LATCH]]
 ; CHECK:       [[IF_THEN]]:
 ; CHECK-NEXT:    tail call void @foo(i32 10)
 ; CHECK-NEXT:    br label %[[LOOP_LATCH]]
 ; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[GEP_X1:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[IV1]]
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[GEP_X1]], align 4
+; CHECK-NEXT:    [[ADD1]] = add nsw i32 [[L1]], [[RED1]]
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i32 [[IV_NEXT1]], 99
+; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[LOOPEXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER_PEEL:.*]]
+; CHECK:       [[LOOP_HEADER_PEEL]]:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[IV]], 99
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[IF_THEN_PEEL:.*]], label %[[LOOP_LATCH_PEEL:.*]]
+; CHECK:       [[IF_THEN_PEEL]]:
+; CHECK-NEXT:    tail call void @foo(i32 10)
+; CHECK-NEXT:    br label %[[LOOP_LATCH_PEEL]]
+; CHECK:       [[LOOP_LATCH_PEEL]]:
 ; CHECK-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_X]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[L]], [[RED]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L]], [[RED]]
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOPEXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOPEXIT_PEEL_NEXT:.*]], label %[[LOOPEXIT_PEEL_NEXT]]
+; CHECK:       [[LOOPEXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_HEADER_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOPEXIT:.*]]
 ; CHECK:       [[LOOPEXIT]]:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT:    br label %[[EXIT]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD]], %[[LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -552,4 +552,3 @@ exit:
 
 declare void @foo(i32)
 declare i1 @cond()
-

>From 79fb447fec6f4bf141b7cb3798f4918e56963c19 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 12 May 2025 22:22:30 +0100
Subject: [PATCH 2/5] !fixup address latest comments, thanks

---
 llvm/include/llvm/Transforms/Utils/LoopPeel.h | 10 +++--
 llvm/lib/Transforms/Utils/LoopPeel.cpp        | 42 +++++++++++--------
 .../LoopUnroll/peel-last-iteration.ll         | 36 +++++++++++++++-
 3 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index f7babaf036768..dd59a9c766e45 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -21,11 +21,15 @@ namespace llvm {
 
 bool canPeel(const Loop *L);
 
+/// Returns true if the last iteration of \p L can be peeled off. It makes sure
+/// the loop exit condition can be adjusted when peeling and that the loop
+/// executes at least 2 iterations.
+bool canPeelLastIteration(const Loop &L, ScalarEvolution &SE);
+
 /// VMap is the value-map that maps instructions from the original loop to
 /// instructions in the last peeled-off iteration. If \p PeelLast is true, peel
-/// off the last \p PeelCount iterations from \p L. In that case, the caller has
-/// to make sure that the exit condition can be adjusted when peeling and that
-/// the loop executes at least 2 iterations.
+/// off the last \p PeelCount iterations from \p L (canPeelLastIteration must be
+/// true for \p L), otherwise peel off the first \p PeelCount iterations.
 bool peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
               ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,
               bool PreserveLCSSA, ValueToValueMapTy &VMap);
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f01c3948b87ed..d41d6092c6dfe 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -326,13 +326,7 @@ static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
   return 0;
 }
 
-/// Returns true if the last iteration can be peeled off and the condition (Pred
-/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition
-/// is known at the second-to-last. This function also has to make sure the loop
-/// exit condition can be adjusted when peeling and that the loop executes at
-/// least 2 iterations.
-static bool canPeelLastIteration(Loop &L, const SCEVAddRecExpr *LeftAR,
-                                 const SCEV *RightSCEV, ScalarEvolution &SE) {
+bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
   const SCEV *BTC = SE.getBackedgeTakenCount(&L);
   Value *Inc;
   CmpPredicate Pred;
@@ -351,16 +345,27 @@ static bool canPeelLastIteration(Loop &L, const SCEVAddRecExpr *LeftAR,
   // * the exit condition must be a NE/EQ compare of an induction with step
   // of 1.
   BasicBlock *Latch = L.getLoopLatch();
-  if (Latch != L.getExitingBlock() ||
-      !match(Latch->getTerminator(),
-             m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1),
-                  m_BasicBlock(Succ2))) ||
-      !((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
-        (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) ||
-      !isa<SCEVAddRecExpr>(SE.getSCEV(Inc)) ||
-      !cast<SCEVAddRecExpr>(SE.getSCEV(Inc))->getStepRecurrence(SE)->isOne())
+  return Latch == L.getExitingBlock() &&
+         match(Latch->getTerminator(),
+               m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1),
+                    m_BasicBlock(Succ2))) &&
+         ((Pred == CmpInst::ICMP_EQ && Succ2 == L.getHeader()) ||
+          (Pred == CmpInst::ICMP_NE && Succ1 == L.getHeader())) &&
+         isa<SCEVAddRecExpr>(SE.getSCEV(Inc)) &&
+         cast<SCEVAddRecExpr>(SE.getSCEV(Inc))->getStepRecurrence(SE)->isOne();
+}
+
+/// Returns true if the last iteration can be peeled off and the condition (Pred
+/// LeftAR, RightSCEV) is known at the last iteration and the inverse condition
+/// is known at the second-to-last.
+static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
+                                    const SCEVAddRecExpr *LeftAR,
+                                    const SCEV *RightSCEV,
+                                    ScalarEvolution &SE) {
+  if (!canPeelLastIteration(L, SE))
     return false;
 
+  const SCEV *BTC = SE.getBackedgeTakenCount(&L);
   const SCEV *ValAtLastIter =
       SE.applyLoopGuards(LeftAR->evaluateAtIteration(BTC, SE), &L);
   const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
@@ -470,7 +475,7 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     const SCEV *Step = LeftAR->getStepRecurrence(SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
                                    Pred)) {
-      if (canPeelLastIteration(L, LeftAR, RightSCEV, SE))
+      if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE))
         DesiredPeelCount = -1;
       return;
     }
@@ -987,8 +992,9 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
                     bool PreserveLCSSA, ValueToValueMapTy &LVMap) {
   assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
   assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");
-  assert((!PeelLast || PeelCount == 1) &&
-         "can only peel off a single iteration from the end for now");
+  assert((!PeelLast || (canPeelLastIteration(*L, *SE) && PeelCount == 1)) &&
+         "when peeling the last iteration, the loop must be supported and can "
+         "only peel a single iteration");
 
   LoopBlocksDFS LoopBlocks(L);
   LoopBlocks.perform(LI);
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
index 6afdcd39c3afe..981b3fa142d83 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
@@ -85,6 +85,7 @@ define i64 @peel_single_block_loop_iv_step_1_eq_pred() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
+<<<<<<< HEAD
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 63
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20
@@ -93,7 +94,38 @@ define i64 @peel_single_block_loop_iv_step_1_eq_pred() {
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
+||||||| parent of 4da8bc5afc5b (!fixup address latest comments, thanks)
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP18_NOT]], i32 10, i32 20
+; CHECK-NEXT:    call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+=======
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    [[IV]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
+; CHECK-NEXT:    br i1 [[CMP18_NOT]], label %[[EXIT_PEEL_BEGIN:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+>>>>>>> 4da8bc5afc5b (!fixup address latest comments, thanks)
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_LCSSA]], 63
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP_PEEL]], i32 10, i32 20
+; CHECK-NEXT:    call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV_LCSSA]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
 ;
 entry:
@@ -152,7 +184,7 @@ define i64 @peel_single_block_loop_iv_step_1_nested_loop() {
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-;
+<<<;
 entry:
   br label %outer.header
 
@@ -435,7 +467,7 @@ define i32 @peel_loop_with_branch_and_phi_uses(ptr %x, i1 %c) {
 ; CHECK-NEXT:    [[ADD1]] = add nsw i32 [[L1]], [[RED1]]
 ; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
 ; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i32 [[IV_NEXT1]], 99
-; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[LOOPEXIT_PEEL_BEGIN]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP_LATCH]] ]

>From 9e3ed56d8523652efd2626e295b28e71b988c0d1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 13 May 2025 13:47:25 +0100
Subject: [PATCH 3/5] !fixup update to return pair of peel counts

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp | 58 ++++++++++++++++----------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index d41d6092c6dfe..db6beff69b406 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -376,21 +376,22 @@ static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
          SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV);
 }
 
-// Return the number of iterations to peel off that make conditions in the
-// body true/false. Positive return values indicate the iterations to peel of
-// from the front and negative return values indicate the number of iterations
-// from the back after removing the sign. For example, if we peel 2 iterations
-// off the loop below, the condition i < 2 can be evaluated at compile time.
+// Return the number of iterations to peel off from the beginning and end of the
+// loop respectively, that make conditions in the body true/false. For example,
+// if we peel 2 iterations off the loop below, the condition i < 2 can be
+// evaluated at compile time.
+//
 //  for (i = 0; i < n; i++)
 //    if (i < 2)
 //      ..
 //    else
 //      ..
 //   }
-static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
-                                    ScalarEvolution &SE) {
+static std::pair<unsigned, unsigned>
+countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE) {
   assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
-  int DesiredPeelCount = 0;
+  unsigned DesiredPeelCount = 0;
+  unsigned DesiredPeelCountLast = 0;
 
   // Do not peel the entire loop.
   const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L);
@@ -402,9 +403,9 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
   // return true if inversed condition become known before reaching the
   // MaxPeelCount limit.
   auto PeelWhilePredicateIsKnown =
-      [&](int &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
+      [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
           const SCEV *Step, ICmpInst::Predicate Pred) {
-        while (unsigned(std::abs(PeelCount)) < MaxPeelCount &&
+        while (PeelCount < MaxPeelCount &&
                SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
           IterVal = SE.getAddExpr(IterVal, Step);
           ++PeelCount;
@@ -461,7 +462,7 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
 
     // Check if extending the current DesiredPeelCount lets us evaluate Pred
     // or !Pred in the loop body statically.
-    int NewPeelCount = DesiredPeelCount;
+    unsigned NewPeelCount = DesiredPeelCount;
 
     const SCEV *IterVal = LeftAR->evaluateAtIteration(
         SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
@@ -476,7 +477,7 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,
                                    Pred)) {
       if (shouldPeelLastIteration(L, Pred, LeftAR, RightSCEV, SE))
-        DesiredPeelCount = -1;
+        DesiredPeelCountLast = 1;
       return;
     }
 
@@ -489,12 +490,13 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
                              RightSCEV) &&
         !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
         SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
-      if (unsigned(std::abs(NewPeelCount)) >= MaxPeelCount)
+      if (NewPeelCount >= MaxPeelCount)
         return; // Need to peel one more iteration, but can't. Give up.
       ++NewPeelCount; // Great!
     }
 
     DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
+    DesiredPeelCountLast = std::max(DesiredPeelCountLast, NewPeelCount);
   };
 
   auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {
@@ -528,7 +530,7 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     // Check that AddRec is not wrapping.
     if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
       return;
-    int NewPeelCount = DesiredPeelCount;
+    unsigned NewPeelCount = DesiredPeelCount;
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), NewPeelCount), SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
@@ -556,7 +558,7 @@ static int countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     ComputePeelCount(BI->getCondition(), 0);
   }
 
-  return DesiredPeelCount;
+  return {DesiredPeelCount, DesiredPeelCountLast};
 }
 
 /// This "heuristic" exactly matches implicit behavior which used to exist
@@ -649,9 +651,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
       DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
   }
 
-  int CountToEliminateCmps = countToEliminateCompares(*L, MaxPeelCount, SE);
-  DesiredPeelCount =
-      std::max(DesiredPeelCount, unsigned(std::abs(CountToEliminateCmps)));
+  const auto &[CountToEliminateCmps, CountToEliminateCmpsLast] =
+      countToEliminateCompares(*L, MaxPeelCount, SE);
+  DesiredPeelCount = std::max(DesiredPeelCount, CountToEliminateCmps);
 
   if (DesiredPeelCount == 0)
     DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC);
@@ -666,9 +668,23 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                         << " some Phis into invariants.\n");
       PP.PeelCount = DesiredPeelCount;
       PP.PeelProfiledIterations = false;
-      PP.PeelLast =
-          DesiredPeelCount == unsigned(std::abs(CountToEliminateCmps)) &&
-          CountToEliminateCmps < 0;
+      PP.PeelLast = false;
+      return;
+    }
+  }
+
+  if (CountToEliminateCmpsLast > 0) {
+    unsigned DesiredPeelCountLast =
+        std::min(CountToEliminateCmpsLast, MaxPeelCount);
+    // Consider max peel count limitation.
+    assert(DesiredPeelCountLast > 0 && "Wrong loop size estimation?");
+    if (DesiredPeelCountLast + AlreadyPeeled <= UnrollPeelMaxCount) {
+      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+                        << " iteration(s) to turn"
+                        << " some Phis into invariants.\n");
+      PP.PeelCount = DesiredPeelCountLast;
+      PP.PeelProfiledIterations = false;
+      PP.PeelLast = true;
       return;
     }
   }

>From 1e4674e82ae8e9663570fe321e52585ca891af23 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 May 2025 14:57:14 +0100
Subject: [PATCH 4/5] !fixup remove applyLoopGuards not needed in this version.

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index db6beff69b406..719db6df4556e 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -366,13 +366,12 @@ static bool shouldPeelLastIteration(Loop &L, CmpPredicate Pred,
     return false;
 
   const SCEV *BTC = SE.getBackedgeTakenCount(&L);
-  const SCEV *ValAtLastIter =
-      SE.applyLoopGuards(LeftAR->evaluateAtIteration(BTC, SE), &L);
+  const SCEV *ValAtLastIter = LeftAR->evaluateAtIteration(BTC, SE);
   const SCEV *ValAtSecondToLastIter = LeftAR->evaluateAtIteration(
       SE.getMinusSCEV(BTC, SE.getOne(BTC->getType())), SE);
 
   return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), ValAtLastIter,
-                             SE.applyLoopGuards(RightSCEV, &L)) &&
+                             RightSCEV) &&
          SE.isKnownPredicate(Pred, ValAtSecondToLastIter, RightSCEV);
 }
 

>From cffd7b63c7ca39914104da33f7d32dcdc004b236 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 15 May 2025 12:44:11 +0100
Subject: [PATCH 5/5] !fixup adjust BTC check, make sure latch non-null,
 thanks

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |   4 +-
 .../LoopUnroll/peel-last-iteration.ll         | 136 +++++++++++++-----
 2 files changed, 101 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 719db6df4556e..f15252b5f77e1 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -336,7 +336,7 @@ bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
   // iteration executes.
   // TODO: Add checks during codegen.
   if (isa<SCEVCouldNotCompute>(BTC) ||
-      !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getOne(BTC->getType())))
+      !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getZero(BTC->getType())))
     return false;
 
   // Check if the exit condition of the loop can be adjusted by the peeling
@@ -345,7 +345,7 @@ bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
   // * the exit condition must be a NE/EQ compare of an induction with step
   // of 1.
   BasicBlock *Latch = L.getLoopLatch();
-  return Latch == L.getExitingBlock() &&
+  return Latch && Latch == L.getExitingBlock() &&
          match(Latch->getTerminator(),
                m_Br(m_ICmp(Pred, m_Value(Inc), m_Value()), m_BasicBlock(Succ1),
                     m_BasicBlock(Succ2))) &&
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
index 981b3fa142d83..78a13b83ec8d1 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll
@@ -85,40 +85,20 @@ define i64 @peel_single_block_loop_iv_step_1_eq_pred() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-<<<<<<< HEAD
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 63
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20
-; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK:       [[EXIT]]:
-||||||| parent of 4da8bc5afc5b (!fixup address latest comments, thanks)
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP18_NOT]], i32 10, i32 20
-; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK:       [[EXIT]]:
-=======
-; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_LCSSA:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    call void @foo(i32 20)
-; CHECK-NEXT:    [[IV]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
-; CHECK-NEXT:    br i1 [[CMP18_NOT]], label %[[EXIT_PEEL_BEGIN:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[IV_LCSSA]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_LCSSA]], 63
+; CHECK-NEXT:    br i1 [[CMP_PEEL]], label %[[EXIT_PEEL_BEGIN:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
->>>>>>> 4da8bc5afc5b (!fixup address latest comments, thanks)
-; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_LCSSA]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
-; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_LCSSA]], 63
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP_PEEL]], i32 10, i32 20
+; CHECK-NEXT:    [[CMP_PEEL1:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP_PEEL1]], i32 10, i32 20
 ; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV_LCSSA]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV_NEXT_LCSSA]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
 ; CHECK:       [[EXIT_PEEL_NEXT]]:
@@ -126,7 +106,7 @@ define i64 @peel_single_block_loop_iv_step_1_eq_pred() {
 ; CHECK:       [[LOOP_PEEL_NEXT]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
+; CHECK-NEXT:    ret i64 [[IV_NEXT_LCSSA]]
 ;
 entry:
   br label %loop
@@ -184,7 +164,35 @@ define i64 @peel_single_block_loop_iv_step_1_nested_loop() {
 ; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-<<<;
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 63
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[OUTER_LATCH_PEEL_BEGIN:.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[OUTER_LATCH_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = select i1 [[CMP_PEEL]], i32 10, i32 20
+; CHECK-NEXT:    call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    [[IV_NEXT_PEEL:%.*]] = add i64 [[IV_NEXT_LCSSA]], 1
+; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i64 [[IV_NEXT_PEEL]], 64
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[OUTER_LATCH_PEEL_NEXT:.*]], label %[[OUTER_LATCH_PEEL_NEXT]]
+; CHECK:       [[OUTER_LATCH_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    call void @foo(i32 1)
+; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[OUTER_EC:%.*]] = icmp ne i64 [[OUTER_IV_NEXT]], 100
+; CHECK-NEXT:    br i1 [[OUTER_EC]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[IV_LCSSA_LCSSA:%.*]] = phi i64 [ [[IV_NEXT_LCSSA]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[IV_LCSSA_LCSSA]]
+;
 entry:
   br label %outer.header
 
@@ -216,6 +224,40 @@ define i64 @peel_multi_block_loop_iv_step_1() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LATCH]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    br label %[[LATCH]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 63
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LATCH]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LATCH]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = select i1 [[CMP_PEEL]], i32 10, i32 20
+; CHECK-NEXT:    call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    [[C_PEEL:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C_PEEL]], label %[[THEN_PEEL:.*]], label %[[LATCH_PEEL:.*]]
+; CHECK:       [[THEN_PEEL]]:
+; CHECK-NEXT:    call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    br label %[[LATCH_PEEL]]
+; CHECK:       [[LATCH_PEEL]]:
+; CHECK-NEXT:    [[IV_NEXT_PEEL:%.*]] = add i64 [[IV_NEXT_LCSSA]], 1
+; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i64 [[IV_NEXT_PEEL]], 64
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i64 [[IV_NEXT_LCSSA]]
 ;
 entry:
   br label %loop
@@ -317,16 +359,27 @@ define i64 @peel_single_block_loop_iv_step_1_btc_1() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 20)
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    br i1 false, label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20
 ; CHECK-NEXT:    call void @foo(i32 [[COND]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 2
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
+; CHECK-NEXT:    ret i64 [[IV]]
 ;
 entry:
   br label %loop
@@ -467,7 +520,7 @@ define i32 @peel_loop_with_branch_and_phi_uses(ptr %x, i1 %c) {
 ; CHECK-NEXT:    [[ADD1]] = add nsw i32 [[L1]], [[RED1]]
 ; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
 ; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i32 [[IV_NEXT1]], 99
-; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[LOOPEXIT_PEEL_BEGIN]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP_LATCH]] ]
@@ -584,3 +637,12 @@ exit:
 
 declare void @foo(i32)
 declare i1 @cond()
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+;.



More information about the llvm-commits mailing list