[llvm] [LoopPeel] Peel last iteration to enable load widening (PR #173420)

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 23 13:36:33 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Guy David (guy-david)

<details>
<summary>Changes</summary>

In loops that perform several consecutive small loads per iteration (e.g., three i8 loads covering 3 bytes), peeling the last iteration makes it safe to read a few bytes past the region accessed by the current iteration: the extra bytes fall within the next iteration's accesses and are therefore known to be dereferenceable. This enables a single wider load (e.g., i32) in the remaining N-1 iterations.

Patterns such as:
```
  %a = load i8, ptr %p
  %p.1 = getelementptr i8, ptr %p, i64 1
  %b = load i8, ptr %p.1
  %p.2 = getelementptr i8, ptr %p, i64 2
  %c = load i8, ptr %p.2
  ...
  %p.next = getelementptr i8, ptr %p, i64 3
```

can be transformed into:
```
  %wide = load i32, ptr %p        ; reads 4 bytes
  %a = trunc i32 %wide to i8
  %b.shr = lshr i32 %wide, 8
  %b = trunc i32 %b.shr to i8
  %c.shr = lshr i32 %wide, 16
  %c = trunc i32 %c.shr to i8
  ...
```

This acts as a fallback strategy when vectorization fails and yields significant performance uplifts, most notably in image processing, where iterating over an RGB buffer (3 bytes per pixel) is common.
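For illustration only, a hypothetical source-level loop of the shape this targets (the function and names below are not from the patch); when the vectorizer leaves it scalar, peeling the last iteration lets the three byte loads in the remaining iterations be folded into one i32 load plus lshr/trunc as shown above:
```
#include <cstdint>

// Hypothetical example, not taken from the patch: three consecutive i8 loads
// per iteration, with the source pointer effectively advancing by 3 bytes.
void sumRGB(const uint8_t *Src, uint8_t *Dst, int N) {
  for (int I = 0; I < N; ++I) {
    uint8_t R = Src[3 * I + 0];
    uint8_t G = Src[3 * I + 1];
    uint8_t B = Src[3 * I + 2];
    Dst[I] = static_cast<uint8_t>(R + G + B); // wraps mod 256 (i8 arithmetic)
  }
}
```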

---

Patch is 50.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/173420.diff


6 Files Affected:

- (modified) llvm/include/llvm/Transforms/Utils/LoopPeel.h (+7-1) 
- (modified) llvm/include/llvm/Transforms/Utils/UnrollLoop.h (+10-8) 
- (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+24-10) 
- (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+230-1) 
- (added) llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll (+104) 
- (added) llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening.ll (+616) 


``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 49dbc9aa1f2a9..9daf4789082cc 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -46,7 +46,13 @@ void computePeelCount(Loop *L, unsigned LoopSize,
                       unsigned TripCount, DominatorTree &DT,
                       ScalarEvolution &SE, const TargetTransformInfo &TTI,
                       AssumptionCache *AC = nullptr,
-                      unsigned Threshold = UINT_MAX);
+                      unsigned Threshold = UINT_MAX,
+                      bool AllowLoadWideningPeel = true);
+
+/// Combine load instructions in a loop into a wider one, given that we peeled
+/// the last iteration and can assume the bytes are dereferenceable.
+bool widenLoadsAfterPeel(Loop &L, ScalarEvolution &SE, const DataLayout &DL,
+                         const TargetTransformInfo &TTI, DominatorTree &DT);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index a3efc43c62dc3..073f8c26c664e 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -157,14 +157,16 @@ class UnrollCostEstimator {
                       unsigned CountOverwrite = 0) const;
 };
 
-LLVM_ABI bool computeUnrollCount(
-    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
-    AssumptionCache *AC, ScalarEvolution &SE,
-    const SmallPtrSetImpl<const Value *> &EphValues,
-    OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
-    bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
-    TargetTransformInfo::UnrollingPreferences &UP,
-    TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound);
+LLVM_ABI bool
+computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+                   LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE,
+                   const SmallPtrSetImpl<const Value *> &EphValues,
+                   OptimizationRemarkEmitter *ORE, unsigned TripCount,
+                   unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple,
+                   const UnrollCostEstimator &UCE,
+                   TargetTransformInfo::UnrollingPreferences &UP,
+                   TargetTransformInfo::PeelingPreferences &PP,
+                   bool &UseUpperBound, bool AllowLoadWideningPeel = true);
 
 LLVM_ABI std::optional<RecurrenceDescriptor>
 canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 802ae4e9c28e3..6ecc50d741ce2 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -921,14 +921,17 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
 // FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
 // many LoopUnroll-specific options. The shared functionality should be
 // refactored into it own function.
-bool llvm::computeUnrollCount(
-    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
-    AssumptionCache *AC, ScalarEvolution &SE,
-    const SmallPtrSetImpl<const Value *> &EphValues,
-    OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
-    bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
-    TargetTransformInfo::UnrollingPreferences &UP,
-    TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
+bool llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
+                              DominatorTree &DT, LoopInfo *LI,
+                              AssumptionCache *AC, ScalarEvolution &SE,
+                              const SmallPtrSetImpl<const Value *> &EphValues,
+                              OptimizationRemarkEmitter *ORE,
+                              unsigned TripCount, unsigned MaxTripCount,
+                              bool MaxOrZero, unsigned TripMultiple,
+                              const UnrollCostEstimator &UCE,
+                              TargetTransformInfo::UnrollingPreferences &UP,
+                              TargetTransformInfo::PeelingPreferences &PP,
+                              bool &UseUpperBound, bool AllowLoadWideningPeel) {
 
   unsigned LoopSize = UCE.getRolledLoopSize();
 
@@ -1014,7 +1017,8 @@ bool llvm::computeUnrollCount(
   }
 
   // 5th priority is loop peeling.
-  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold);
+  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold,
+                   AllowLoadWideningPeel);
   if (PP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
@@ -1293,10 +1297,14 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   // computeUnrollCount() decides whether it is beneficial to use upper bound to
   // fully unroll the loop.
+  // When OnlyFullUnroll is true, we're running before vectorization
+  // (LoopFullUnrollPass), so disable load widening peeling to avoid peeling
+  // loops that could have been vectorized instead.
   bool UseUpperBound = false;
   bool IsCountSetExplicitly = computeUnrollCount(
       L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,
-      MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);
+      MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound,
+      /*AllowLoadWideningPeel=*/!OnlyFullUnroll);
   if (!UP.Count)
     return LoopUnrollResult::Unmodified;
 
@@ -1316,6 +1324,12 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
     ValueToValueMapTy VMap;
     if (peelLoop(L, PP.PeelCount, PP.PeelLast, LI, &SE, DT, &AC, PreserveLCSSA,
                  VMap)) {
+      // Widen consecutive loads after last-iteration peeling
+      if (PP.PeelLast) {
+        const DataLayout &DL = L->getHeader()->getDataLayout();
+        widenLoadsAfterPeel(*L, SE, DL, TTI, DT);
+      }
+
       simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
       // If the loop was peeled, we already "used up" the profile information
       // we had, so we don't want to unroll or peel again.
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 960ec9d4c7d6e..b86f34a41ab6b 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -86,6 +86,11 @@ static cl::opt<bool> EnablePeelingForIV(
     "enable-peeling-for-iv", cl::init(false), cl::Hidden,
     cl::desc("Enable peeling to convert Phi nodes into IVs"));
 
+static cl::opt<bool> EnablePeelForLoadWidening(
+    "enable-peel-for-load-widening", cl::init(true), cl::Hidden,
+    cl::desc(
+        "Enable peeling last iteration to enable consecutive load widening"));
+
 static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
 
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
@@ -746,13 +751,148 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
     });
 }
 
+namespace {
+// Represents a group of loads in a loop that can be combined into a wider one.
+struct LoadGroup {
+  // Base object being read.
+  Value *BasePtr;
+  // First load instruction in the program order.
+  LoadInst *FirstLoad;
+  // Pairs of (load instruction, offset from base).
+  SmallVector<std::pair<LoadInst *, APInt>, 4> Loads;
+  // An applicable wider integer type to load as.
+  Type *WideType;
+};
+
+// Helper to compute load group span and validate for widening.
+static std::optional<LoadGroup> tryFormLoadGroupForWidening(
+    Value *Base, SmallVectorImpl<std::pair<LoadInst *, APInt>> &Loads, Loop &L,
+    ScalarEvolution &SE, const DataLayout &DL, const TargetTransformInfo &TTI) {
+  // Find the span of the loaded data.
+  int64_t Left = INT64_MAX;
+  int64_t Right = INT64_MIN;
+  for (const auto &[Load, Offset] : Loads) {
+    Left = std::min(Left, Offset.getSExtValue());
+    Right = std::max(
+        Right, Offset.getSExtValue() +
+                   static_cast<int64_t>(DL.getTypeStoreSize(Load->getType())));
+  }
+  assert((Left < Right) && "Invalid load group span");
+  uint64_t TotalBytes = Right - Left;
+  uint64_t TotalBits = TotalBytes * 8;
+  // Powers of two are already natural for most targets.
+  if (isPowerOf2_64(TotalBits))
+    return std::nullopt;
+  Type *WideType =
+      DL.getSmallestLegalIntType(L.getHeader()->getContext(), TotalBits);
+  if (!WideType)
+    return std::nullopt;
+  unsigned WideBits = WideType->getIntegerBitWidth();
+  // Total size is already natural for the target.
+  if (WideBits == TotalBits)
+    return std::nullopt;
+  // Peeling doubles dereferenceable bytes, ensure wide type fits.
+  if (WideBits > TotalBits * 2)
+    return std::nullopt;
+  // Check alignment is unconstrained.
+  unsigned Fast = 0;
+  if (!TTI.allowsMisalignedMemoryAccesses(L.getHeader()->getContext(), WideBits,
+                                          DL.getDefaultGlobalsAddressSpace(),
+                                          Align(1), &Fast) &&
+      !Fast)
+    return std::nullopt;
+  // Validate pointer stride across iterations.
+  const SCEV *PtrSCEV = SE.getSCEV(Base);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrSCEV);
+  if (!AR || AR->getLoop() != &L)
+    return std::nullopt;
+  const SCEV *Step = AR->getStepRecurrence(SE);
+  auto *ConstStep = dyn_cast<SCEVConstant>(Step);
+  if (!ConstStep)
+    return std::nullopt;
+  int64_t StepVal = ConstStep->getValue()->getSExtValue();
+  if (StepVal != static_cast<int64_t>(TotalBytes))
+    return std::nullopt;
+
+  LoadInst *FirstLoad = Loads[0].first;
+  llvm::sort(Loads, [](const auto &A, const auto &B) {
+    return A.second.slt(B.second);
+  });
+  return LoadGroup{Base, FirstLoad, std::move(Loads), WideType};
+}
+
+// Find groups of consecutive loads in a basic block for peeling purposes
+static SmallVector<LoadGroup>
+findLoadGroupsForWidening(BasicBlock *BB, Loop &L, ScalarEvolution &SE,
+                          const DataLayout &DL,
+                          const TargetTransformInfo &TTI) {
+  SmallVector<LoadGroup> Groups;
+  // Mapping from base pointer to loads instructions and their offset from the
+  // base.
+  DenseMap<Value *, SmallVector<std::pair<LoadInst *, APInt>>> LoadsByBase;
+
+  auto ProcessCollectedLoads = [&]() {
+    for (auto &[Base, Loads] : LoadsByBase) {
+      if (Loads.size() <= 1)
+        continue;
+      if (auto Group = tryFormLoadGroupForWidening(Base, Loads, L, SE, DL, TTI))
+        Groups.emplace_back(*std::move(Group));
+    }
+    LoadsByBase.clear();
+  };
+
+  for (Instruction &I : *BB) {
+    if (auto *Load = dyn_cast<LoadInst>(&I)) {
+      if (Load->isVolatile() || Load->isAtomic() ||
+          !Load->getType()->isIntegerTy() ||
+          Load->getPointerAddressSpace() != DL.getDefaultGlobalsAddressSpace())
+        continue;
+      Value *Ptr = Load->getPointerOperand();
+      APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+      Value *ActualBase = Ptr->stripAndAccumulateConstantOffsets(
+          DL, Offset, /*AllowNonInbounds=*/false);
+      LoadsByBase[ActualBase].emplace_back(Load, Offset);
+    } else if (I.mayHaveSideEffects())
+      ProcessCollectedLoads();
+  }
+  ProcessCollectedLoads();
+  return Groups;
+}
+
+// Returns 1 if peeling the last iteration would enable widening load groups to
+// natural sizes. Returns 0 otherwise.
+static unsigned peelLastForLoadWidening(Loop &L, ScalarEvolution &SE,
+                                        const DataLayout &DL,
+                                        const TargetTransformInfo &TTI,
+                                        DominatorTree &DT) {
+  if (!EnablePeelForLoadWidening)
+    return 0;
+  if (!L.isInnermost())
+    return 0;
+  if (!canPeelLastIteration(L, SE))
+    return 0;
+  BasicBlock *Latch = L.getLoopLatch();
+  if (!Latch)
+    return 0;
+  for (BasicBlock *BB : L.blocks()) {
+    // Look for consecutive loads in blocks that execute every iteration.
+    if (!DT.dominates(BB, Latch))
+      continue;
+    auto Groups = findLoadGroupsForWidening(BB, L, SE, DL, TTI);
+    if (!Groups.empty())
+      return 1;
+  }
+  return 0;
+}
+} // anonymous namespace
 
 // Return the number of iterations we want to peel off.
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                             TargetTransformInfo::PeelingPreferences &PP,
                             unsigned TripCount, DominatorTree &DT,
                             ScalarEvolution &SE, const TargetTransformInfo &TTI,
-                            AssumptionCache *AC, unsigned Threshold) {
+                            AssumptionCache *AC, unsigned Threshold,
+                            bool AllowLoadWideningPeel) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
   // Save the PP.PeelCount value set by the target in
   // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -852,6 +992,25 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
     }
   }
 
+  // Check for consecutive load widening opportunity.
+  // Skip this when running before vectorization (AllowLoadWideningPeel=false)
+  // to avoid peeling loops that could have been vectorized instead.
+  if (PP.PeelCount == 0 && AllowLoadWideningPeel) {
+    const DataLayout &DL = L->getHeader()->getDataLayout();
+    unsigned LoadWideningPeel = peelLastForLoadWidening(*L, SE, DL, TTI, DT);
+    if (LoadWideningPeel > 0) {
+      if (LoadWideningPeel + AlreadyPeeled <= UnrollPeelMaxCount) {
+        LLVM_DEBUG(
+            dbgs() << "Peel last " << LoadWideningPeel
+                   << " iteration(s) to enable consecutive load widening.\n");
+        PP.PeelCount = LoadWideningPeel;
+        PP.PeelProfiledIterations = false;
+        PP.PeelLast = true;
+        return;
+      }
+    }
+  }
+
   // Bail if we know the statically calculated trip count.
   // In this case we rather prefer partial unrolling.
   if (TripCount)
@@ -890,6 +1049,76 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   }
 }
 
+bool llvm::widenLoadsAfterPeel(Loop &L, ScalarEvolution &SE,
+                               const DataLayout &DL,
+                               const TargetTransformInfo &TTI,
+                               DominatorTree &DT) {
+  BasicBlock *Latch = L.getLoopLatch();
+  if (!Latch)
+    return false;
+
+  bool Changed = false;
+
+  for (BasicBlock *BB : L.blocks()) {
+    if (!DT.dominates(BB, Latch))
+      continue;
+    SmallVector<LoadGroup> Groups =
+        findLoadGroupsForWidening(BB, L, SE, DL, TTI);
+    for (const LoadGroup &Group : Groups) {
+      LoadInst *InsertPoint = Group.FirstLoad;
+      IRBuilder<> Builder(InsertPoint);
+      Value *BasePtr = Group.BasePtr;
+      int64_t FirstOffset = Group.Loads[0].second.getSExtValue();
+      // If the first load doesn't start at offset 0, we need to adjust.
+      if (FirstOffset != 0) {
+        Value *OrigPtr = Group.BasePtr;
+        BasePtr = Builder.CreatePtrAdd(
+            OrigPtr,
+            ConstantInt::get(
+                Builder.getIndexTy(DL, DL.getDefaultGlobalsAddressSpace()),
+                FirstOffset));
+      }
+      // Merge AA metadata from all loads.
+      AAMDNodes AATags = InsertPoint->getAAMetadata();
+      for (const auto &[Load, Offset] : Group.Loads) {
+        if (Load != InsertPoint)
+          AATags = AATags.concat(Load->getAAMetadata());
+      }
+      // Create the wider load.
+      LoadInst *WideLoad = Builder.CreateLoad(Group.WideType, BasePtr);
+      unsigned SizeInBits = WideLoad->getType()->getScalarSizeInBits();
+      if (AATags)
+        WideLoad->setAAMetadata(AATags);
+      // For each original load, extract the corresponding bytes.
+      for (const auto &[Load, Offset] : Group.Loads) {
+        unsigned LoadBytes = DL.getTypeStoreSize(Load->getType());
+        Value *Extracted = WideLoad;
+        unsigned BitOffset =
+            DL.isBigEndian()
+                ? SizeInBits -
+                      (Offset.getSExtValue() - FirstOffset + LoadBytes) * 8
+                : (Offset.getSExtValue() - FirstOffset) * 8;
+        if (BitOffset != 0)
+          Extracted = Builder.CreateLShr(
+              Extracted, ConstantInt::get(WideLoad->getType(), BitOffset));
+        unsigned TargetBits = Load->getType()->getScalarSizeInBits();
+        if (TargetBits < SizeInBits)
+          Extracted = Builder.CreateTrunc(Extracted, Load->getType());
+        Load->replaceAllUsesWith(Extracted);
+      }
+      // Delete the original loads.
+      for (auto &[Load, Offset] : Group.Loads)
+        Load->eraseFromParent();
+
+      LLVM_DEBUG(dbgs() << "Widened " << Group.Loads.size()
+                        << " loads into a single " << *WideLoad << "\n");
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 /// Clones the body of the loop L, putting it between \p InsertTop and \p
 /// InsertBot.
 /// \param IterNumber The serial number of the iteration currently being
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll
new file mode 100644
index 0000000000000..173130deae048
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=aarch64_be -passes=loop-unroll -S %s | FileCheck %s
+
+; Test that loop peeling for load widening works correctly on big-endian targets.
+; The byte extraction shifts should be different from little-endian.
+
+target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Test 3-byte consecutive loads on big-endian, should peel and use i32 load.
+; For big-endian with i32 load:
+;   byte 0 (lowest address) is in bits [31:24]
+;   byte 1 is in bits [23:16]
+;   byte 2 is in bits [15:8]
+;   byte 3 (unused) is in bits [7:0]
+define void @test_3_consecutive_loads_be(ptr %src, ptr %dst, i32 %n) {
+; CHECK-LABEL: define void @test_3_consecutive_loads_be(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[EXIT_PEEL_BEGIN:.*]]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[SRC]], %[[ENTRY_SPLIT]] ], [ [[P_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 24
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; CHECK-NEXT:    [[SUM1:%.*]] = add i8 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[SUM2:%.*]] = add i8 [[SUM1]], [[TMP8]]
+; CHECK-NEXT:    [[DST_I:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[I]]
+; CHECK-NEXT:    store i8 [[SUM2]], ptr [[DST_I]], align 1
+; CHECK-NEXT:    [[P_NEXT]] = getelementptr inbounds i8, ptr [[P]], i64 3
+; CHECK-NEXT:    [[I_NEXT]] = add nuw i32 [[I]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[I_NEXT]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN_LOOPEXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN_LOOPEXIT]]:
+; CHECK-NEXT:    [[DOTPH:%.*]] = phi i32 [ [[I_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[DOTPH1:%.*]] = phi ptr [ [[P_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT_PEEL_BEGIN]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[DOTPH]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[DOTPH1]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:   ...
[truncated]

``````````
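As a side note on the byte-extraction math in `widenLoadsAfterPeel` above: each narrow load at `Offset` is recovered from the wide value with a logical shift right by `(Offset - FirstOffset) * 8` bits on little-endian targets, or by `SizeInBits - (Offset - FirstOffset + LoadBytes) * 8` on big-endian targets, followed by a trunc. A standalone sketch (not part of the patch; the helper and its names are hypothetical):

```
#include <cassert>
#include <cstdint>

// Sketch of the shift computation used when replacing a group's narrow loads:
// given a wide load of SizeInBits covering a group that starts at FirstOffset,
// recover the narrow value originally loaded at Offset.
static uint64_t extractNarrow(uint64_t Wide, unsigned SizeInBits,
                              int64_t FirstOffset, int64_t Offset,
                              unsigned LoadBytes, bool BigEndian) {
  unsigned BitOffset = BigEndian
                           ? SizeInBits - (Offset - FirstOffset + LoadBytes) * 8
                           : (Offset - FirstOffset) * 8;
  uint64_t Shifted = Wide >> BitOffset; // corresponds to the lshr
  uint64_t Mask =
      (LoadBytes * 8 == 64) ? ~0ULL : ((1ULL << (LoadBytes * 8)) - 1);
  return Shifted & Mask; // corresponds to the trunc
}

int main() {
  // Little-endian: bytes {0x11,0x22,0x33,0x44} at p..p+3 load as 0x44332211,
  // so the i8 at offset 2 sits at bit offset 16 and extracts as 0x33.
  assert(extractNarrow(0x44332211u, 32, 0, 2, 1, /*BigEndian=*/false) == 0x33);
  // Big-endian: the same bytes load as 0x11223344; the bit offset is
  // 32 - (2 + 1) * 8 = 8, and the extracted value is again 0x33.
  assert(extractNarrow(0x11223344u, 32, 0, 2, 1, /*BigEndian=*/true) == 0x33);
  return 0;
}
```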

</details>


https://github.com/llvm/llvm-project/pull/173420


More information about the llvm-commits mailing list