[llvm] [LoopPeel] Peel last iteration to enable load widening (PR #173420)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 23 13:36:33 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Guy David (guy-david)
<details>
<summary>Changes</summary>
In loops that contain multiple consecutive small loads (e.g., three i8 loads of adjacent bytes), peeling the last iteration makes it safe for the remaining N-1 iterations to read slightly past their accessed region, because the following iteration's bytes are known to be dereferenceable. This allows the narrow loads to be replaced with a single wider load (e.g., i32).
Patterns such as:
```
%a = load i8, ptr %p
%p.1 = getelementptr i8, ptr %p, i64 1
%b = load i8, ptr %p.1
%p.2 = getelementptr i8, ptr %p, i64 2
%c = load i8, ptr %p.2
...
%p.next = getelementptr i8, ptr %p, i64 3
```
can be transformed into:
```
%wide = load i32, ptr %p ; Read 4 bytes
%a = trunc i32 %wide to i8
%b.sh = lshr i32 %wide, 8
%b = trunc i32 %b.sh to i8
%c.sh = lshr i32 %wide, 16
%c = trunc i32 %c.sh to i8
...
```
This acts as a fallback strategy when vectorization fails and yields significant performance uplifts, most notably in image processing, where iterating over an RGB buffer is common.
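For a rough intuition of why peeling makes the wider load safe, here is a minimal C-level sketch (the function name, the 3-byte element size, and the little-endian byte order are assumptions for illustration, not taken from the patch): each of the first N-1 iterations may read one extra byte, because that byte belongs to a later element and is therefore dereferenceable.
```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sum the three bytes of each 3-byte element of src into dst[i].
 * Once the last iteration is peeled, iterations 0..n-2 may load 4 bytes,
 * since the extra byte belongs to the next element. Only the peeled last
 * iteration must stay byte-by-byte. */
void sum_rgb(const uint8_t *src, uint8_t *dst, size_t n) {
  size_t i = 0;
  for (; i + 1 < n; ++i) {                 /* first n-1 iterations: widened */
    uint32_t wide;
    memcpy(&wide, src + 3 * i, 4);         /* one 4-byte load */
    uint8_t a = (uint8_t)(wide >> 0);      /* little-endian byte order */
    uint8_t b = (uint8_t)(wide >> 8);
    uint8_t c = (uint8_t)(wide >> 16);
    dst[i] = (uint8_t)(a + b + c);
  }
  if (i < n) {                             /* peeled last iteration */
    uint8_t a = src[3 * i + 0];
    uint8_t b = src[3 * i + 1];
    uint8_t c = src[3 * i + 2];
    dst[i] = (uint8_t)(a + b + c);
  }
}
```
The patch performs the equivalent rewrite directly in IR (wide load plus an lshr/trunc chain), but the dereferenceability argument is the same.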
---
Patch is 50.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/173420.diff
6 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/LoopPeel.h (+7-1)
- (modified) llvm/include/llvm/Transforms/Utils/UnrollLoop.h (+10-8)
- (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+24-10)
- (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+230-1)
- (added) llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll (+104)
- (added) llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening.ll (+616)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 49dbc9aa1f2a9..9daf4789082cc 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -46,7 +46,13 @@ void computePeelCount(Loop *L, unsigned LoopSize,
unsigned TripCount, DominatorTree &DT,
ScalarEvolution &SE, const TargetTransformInfo &TTI,
AssumptionCache *AC = nullptr,
- unsigned Threshold = UINT_MAX);
+ unsigned Threshold = UINT_MAX,
+ bool AllowLoadWideningPeel = true);
+
+/// Combine load instructions in a loop into a wider one, given that we peeled
+/// the last iteration and can assume the bytes are dereferenceable.
+bool widenLoadsAfterPeel(Loop &L, ScalarEvolution &SE, const DataLayout &DL,
+ const TargetTransformInfo &TTI, DominatorTree &DT);
} // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index a3efc43c62dc3..073f8c26c664e 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -157,14 +157,16 @@ class UnrollCostEstimator {
unsigned CountOverwrite = 0) const;
};
-LLVM_ABI bool computeUnrollCount(
- Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- AssumptionCache *AC, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
- bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
- TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound);
+LLVM_ABI bool
+computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned TripCount,
+ unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple,
+ const UnrollCostEstimator &UCE,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP,
+ bool &UseUpperBound, bool AllowLoadWideningPeel = true);
LLVM_ABI std::optional<RecurrenceDescriptor>
canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 802ae4e9c28e3..6ecc50d741ce2 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -921,14 +921,17 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
// many LoopUnroll-specific options. The shared functionality should be
// refactored into it own function.
-bool llvm::computeUnrollCount(
- Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- AssumptionCache *AC, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
- bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
- TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
+bool llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
+ DominatorTree &DT, LoopInfo *LI,
+ AssumptionCache *AC, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE,
+ unsigned TripCount, unsigned MaxTripCount,
+ bool MaxOrZero, unsigned TripMultiple,
+ const UnrollCostEstimator &UCE,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP,
+ bool &UseUpperBound, bool AllowLoadWideningPeel) {
unsigned LoopSize = UCE.getRolledLoopSize();
@@ -1014,7 +1017,8 @@ bool llvm::computeUnrollCount(
}
// 5th priority is loop peeling.
- computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold);
+ computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold,
+ AllowLoadWideningPeel);
if (PP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -1293,10 +1297,14 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
// computeUnrollCount() decides whether it is beneficial to use upper bound to
// fully unroll the loop.
+ // When OnlyFullUnroll is true, we're running before vectorization
+ // (LoopFullUnrollPass), so disable load widening peeling to avoid peeling
+ // loops that could have been vectorized instead.
bool UseUpperBound = false;
bool IsCountSetExplicitly = computeUnrollCount(
L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,
- MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);
+ MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound,
+ /*AllowLoadWideningPeel=*/!OnlyFullUnroll);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
@@ -1316,6 +1324,12 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ValueToValueMapTy VMap;
if (peelLoop(L, PP.PeelCount, PP.PeelLast, LI, &SE, DT, &AC, PreserveLCSSA,
VMap)) {
+ // Widen consecutive loads after last-iteration peeling
+ if (PP.PeelLast) {
+ const DataLayout &DL = L->getHeader()->getDataLayout();
+ widenLoadsAfterPeel(*L, SE, DL, TTI, DT);
+ }
+
simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 960ec9d4c7d6e..b86f34a41ab6b 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -86,6 +86,11 @@ static cl::opt<bool> EnablePeelingForIV(
"enable-peeling-for-iv", cl::init(false), cl::Hidden,
cl::desc("Enable peeling to convert Phi nodes into IVs"));
+static cl::opt<bool> EnablePeelForLoadWidening(
+ "enable-peel-for-load-widening", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable peeling last iteration to enable consecutive load widening"));
+
static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
@@ -746,13 +751,148 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
});
}
+namespace {
+// Represents a group of loads in a loop that can be combined into a wider one.
+struct LoadGroup {
+ // Base object being read.
+ Value *BasePtr;
+ // First load instruction in the program order.
+ LoadInst *FirstLoad;
+ // Pairs of (load instruction, offset from base).
+ SmallVector<std::pair<LoadInst *, APInt>, 4> Loads;
+ // An applicable wider integer type to load as.
+ Type *WideType;
+};
+
+// Helper to compute load group span and validate for widening.
+static std::optional<LoadGroup> tryFormLoadGroupForWidening(
+ Value *Base, SmallVectorImpl<std::pair<LoadInst *, APInt>> &Loads, Loop &L,
+ ScalarEvolution &SE, const DataLayout &DL, const TargetTransformInfo &TTI) {
+ // Find the span of the loaded data.
+ int64_t Left = INT64_MAX;
+ int64_t Right = INT64_MIN;
+ for (const auto &[Load, Offset] : Loads) {
+ Left = std::min(Left, Offset.getSExtValue());
+ Right = std::max(
+ Right, Offset.getSExtValue() +
+ static_cast<int64_t>(DL.getTypeStoreSize(Load->getType())));
+ }
+ assert((Left < Right) && "Invalid load group span");
+ uint64_t TotalBytes = Right - Left;
+ uint64_t TotalBits = TotalBytes * 8;
+ // Powers of two are already natural for most targets.
+ if (isPowerOf2_64(TotalBits))
+ return std::nullopt;
+ Type *WideType =
+ DL.getSmallestLegalIntType(L.getHeader()->getContext(), TotalBits);
+ if (!WideType)
+ return std::nullopt;
+ unsigned WideBits = WideType->getIntegerBitWidth();
+ // Total size is already natural for the target.
+ if (WideBits == TotalBits)
+ return std::nullopt;
+ // Peeling doubles dereferenceable bytes, ensure wide type fits.
+ if (WideBits > TotalBits * 2)
+ return std::nullopt;
+ // Check alignment is unconstrained.
+ unsigned Fast = 0;
+ if (!TTI.allowsMisalignedMemoryAccesses(L.getHeader()->getContext(), WideBits,
+ DL.getDefaultGlobalsAddressSpace(),
+ Align(1), &Fast) &&
+ !Fast)
+ return std::nullopt;
+ // Validate pointer stride across iterations.
+ const SCEV *PtrSCEV = SE.getSCEV(Base);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrSCEV);
+ if (!AR || AR->getLoop() != &L)
+ return std::nullopt;
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ auto *ConstStep = dyn_cast<SCEVConstant>(Step);
+ if (!ConstStep)
+ return std::nullopt;
+ int64_t StepVal = ConstStep->getValue()->getSExtValue();
+ if (StepVal != static_cast<int64_t>(TotalBytes))
+ return std::nullopt;
+
+ LoadInst *FirstLoad = Loads[0].first;
+ llvm::sort(Loads, [](const auto &A, const auto &B) {
+ return A.second.slt(B.second);
+ });
+ return LoadGroup{Base, FirstLoad, std::move(Loads), WideType};
+}
+
+// Find groups of consecutive loads in a basic block for peeling purposes
+static SmallVector<LoadGroup>
+findLoadGroupsForWidening(BasicBlock *BB, Loop &L, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ SmallVector<LoadGroup> Groups;
+ // Mapping from base pointer to load instructions and their offset from the
+ // base.
+ DenseMap<Value *, SmallVector<std::pair<LoadInst *, APInt>>> LoadsByBase;
+
+ auto ProcessCollectedLoads = [&]() {
+ for (auto &[Base, Loads] : LoadsByBase) {
+ if (Loads.size() <= 1)
+ continue;
+ if (auto Group = tryFormLoadGroupForWidening(Base, Loads, L, SE, DL, TTI))
+ Groups.emplace_back(*std::move(Group));
+ }
+ LoadsByBase.clear();
+ };
+
+ for (Instruction &I : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ if (Load->isVolatile() || Load->isAtomic() ||
+ !Load->getType()->isIntegerTy() ||
+ Load->getPointerAddressSpace() != DL.getDefaultGlobalsAddressSpace())
+ continue;
+ Value *Ptr = Load->getPointerOperand();
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Value *ActualBase = Ptr->stripAndAccumulateConstantOffsets(
+ DL, Offset, /*AllowNonInbounds=*/false);
+ LoadsByBase[ActualBase].emplace_back(Load, Offset);
+ } else if (I.mayHaveSideEffects())
+ ProcessCollectedLoads();
+ }
+ ProcessCollectedLoads();
+ return Groups;
+}
+
+// Returns 1 if peeling the last iteration would enable widening load groups to
+// natural sizes. Returns 0 otherwise.
+static unsigned peelLastForLoadWidening(Loop &L, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ DominatorTree &DT) {
+ if (!EnablePeelForLoadWidening)
+ return 0;
+ if (!L.isInnermost())
+ return 0;
+ if (!canPeelLastIteration(L, SE))
+ return 0;
+ BasicBlock *Latch = L.getLoopLatch();
+ if (!Latch)
+ return 0;
+ for (BasicBlock *BB : L.blocks()) {
+ // Look for consecutive loads in blocks that execute every iteration.
+ if (!DT.dominates(BB, Latch))
+ continue;
+ auto Groups = findLoadGroupsForWidening(BB, L, SE, DL, TTI);
+ if (!Groups.empty())
+ return 1;
+ }
+ return 0;
+}
+} // anonymous namespace
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
unsigned TripCount, DominatorTree &DT,
ScalarEvolution &SE, const TargetTransformInfo &TTI,
- AssumptionCache *AC, unsigned Threshold) {
+ AssumptionCache *AC, unsigned Threshold,
+ bool AllowLoadWideningPeel) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
// Save the PP.PeelCount value set by the target in
// TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -852,6 +992,25 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
}
}
+ // Check for consecutive load widening opportunity.
+ // Skip this when running before vectorization (AllowLoadWideningPeel=false)
+ // to avoid peeling loops that could have been vectorized instead.
+ if (PP.PeelCount == 0 && AllowLoadWideningPeel) {
+ const DataLayout &DL = L->getHeader()->getDataLayout();
+ unsigned LoadWideningPeel = peelLastForLoadWidening(*L, SE, DL, TTI, DT);
+ if (LoadWideningPeel > 0) {
+ if (LoadWideningPeel + AlreadyPeeled <= UnrollPeelMaxCount) {
+ LLVM_DEBUG(
+ dbgs() << "Peel last " << LoadWideningPeel
+ << " iteration(s) to enable consecutive load widening.\n");
+ PP.PeelCount = LoadWideningPeel;
+ PP.PeelProfiledIterations = false;
+ PP.PeelLast = true;
+ return;
+ }
+ }
+ }
+
// Bail if we know the statically calculated trip count.
// In this case we rather prefer partial unrolling.
if (TripCount)
@@ -890,6 +1049,76 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
}
}
+bool llvm::widenLoadsAfterPeel(Loop &L, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ DominatorTree &DT) {
+ BasicBlock *Latch = L.getLoopLatch();
+ if (!Latch)
+ return false;
+
+ bool Changed = false;
+
+ for (BasicBlock *BB : L.blocks()) {
+ if (!DT.dominates(BB, Latch))
+ continue;
+ SmallVector<LoadGroup> Groups =
+ findLoadGroupsForWidening(BB, L, SE, DL, TTI);
+ for (const LoadGroup &Group : Groups) {
+ LoadInst *InsertPoint = Group.FirstLoad;
+ IRBuilder<> Builder(InsertPoint);
+ Value *BasePtr = Group.BasePtr;
+ int64_t FirstOffset = Group.Loads[0].second.getSExtValue();
+ // If the first load doesn't start at offset 0, we need to adjust.
+ if (FirstOffset != 0) {
+ Value *OrigPtr = Group.BasePtr;
+ BasePtr = Builder.CreatePtrAdd(
+ OrigPtr,
+ ConstantInt::get(
+ Builder.getIndexTy(DL, DL.getDefaultGlobalsAddressSpace()),
+ FirstOffset));
+ }
+ // Merge AA metadata from all loads.
+ AAMDNodes AATags = InsertPoint->getAAMetadata();
+ for (const auto &[Load, Offset] : Group.Loads) {
+ if (Load != InsertPoint)
+ AATags = AATags.concat(Load->getAAMetadata());
+ }
+ // Create the wider load.
+ LoadInst *WideLoad = Builder.CreateLoad(Group.WideType, BasePtr);
+ unsigned SizeInBits = WideLoad->getType()->getScalarSizeInBits();
+ if (AATags)
+ WideLoad->setAAMetadata(AATags);
+ // For each original load, extract the corresponding bytes.
+ for (const auto &[Load, Offset] : Group.Loads) {
+ unsigned LoadBytes = DL.getTypeStoreSize(Load->getType());
+ Value *Extracted = WideLoad;
+ unsigned BitOffset =
+ DL.isBigEndian()
+ ? SizeInBits -
+ (Offset.getSExtValue() - FirstOffset + LoadBytes) * 8
+ : (Offset.getSExtValue() - FirstOffset) * 8;
+ if (BitOffset != 0)
+ Extracted = Builder.CreateLShr(
+ Extracted, ConstantInt::get(WideLoad->getType(), BitOffset));
+ unsigned TargetBits = Load->getType()->getScalarSizeInBits();
+ if (TargetBits < SizeInBits)
+ Extracted = Builder.CreateTrunc(Extracted, Load->getType());
+ Load->replaceAllUsesWith(Extracted);
+ }
+ // Delete the original loads.
+ for (auto &[Load, Offset] : Group.Loads)
+ Load->eraseFromParent();
+
+ LLVM_DEBUG(dbgs() << "Widened " << Group.Loads.size()
+ << " loads into a single " << *WideLoad << "\n");
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll
new file mode 100644
index 0000000000000..173130deae048
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-load-widening-be.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=aarch64_be -passes=loop-unroll -S %s | FileCheck %s
+
+; Test that loop peeling for load widening works correctly on big-endian targets.
+; The byte extraction shifts should be different from little-endian.
+
+target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Test 3-byte consecutive loads on big-endian, should peel and use i32 load.
+; For big-endian with i32 load:
+; byte 0 (lowest address) is in bits [31:24]
+; byte 1 is in bits [23:16]
+; byte 2 is in bits [15:8]
+; byte 3 (unused) is in bits [7:0]
+define void @test_3_consecutive_loads_be(ptr %src, ptr %dst, i32 %n) {
+; CHECK-LABEL: define void @test_3_consecutive_loads_be(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[EXIT_PEEL_BEGIN:.*]]
+; CHECK: [[ENTRY_SPLIT]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[SRC]], %[[ENTRY_SPLIT]] ], [ [[P_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 24
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
+; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP2]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; CHECK-NEXT: [[SUM1:%.*]] = add i8 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[SUM2:%.*]] = add i8 [[SUM1]], [[TMP8]]
+; CHECK-NEXT: [[DST_I:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[I]]
+; CHECK-NEXT: store i8 [[SUM2]], ptr [[DST_I]], align 1
+; CHECK-NEXT: [[P_NEXT]] = getelementptr inbounds i8, ptr [[P]], i64 3
+; CHECK-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], [[TMP9]]
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN_LOOPEXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[EXIT_PEEL_BEGIN_LOOPEXIT]]:
+; CHECK-NEXT: [[DOTPH:%.*]] = phi i32 [ [[I_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[DOTPH1:%.*]] = phi ptr [ [[P_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: br label %[[EXIT_PEEL_BEGIN]]
+; CHECK: [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[DOTPH]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[DOTPH1]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT: br label %[[LOOP_PEEL:.*]]
+; CHECK: ...
[truncated]
``````````
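One detail worth calling out from the big-endian test above: the shift amounts (24, 16, 8) come from the BitOffset computation in widenLoadsAfterPeel, which places the lowest-addressed byte in the most significant bits on big-endian targets. A small C sketch of that relationship (the helper name is made up for illustration):
```c
#include <stdbool.h>

/* Bit offset of a narrow load within the wide load, matching the patch:
 * big-endian:    SizeInBits - (ByteOff + LoadBytes) * 8
 * little-endian: ByteOff * 8
 * ByteOff is the narrow load's byte distance from the start of the wide load. */
static unsigned bitOffset(unsigned ByteOff, unsigned LoadBytes,
                          unsigned WideBytes, bool BigEndian) {
  return BigEndian ? (WideBytes - ByteOff - LoadBytes) * 8
                   : ByteOff * 8;
}
```
For the 3 x i8 case with an i32 wide load (WideBytes = 4, LoadBytes = 1), this yields 24/16/8 on big-endian and 0/8/16 on little-endian, matching the two test files.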
</details>
https://github.com/llvm/llvm-project/pull/173420
More information about the llvm-commits mailing list