[llvm] [AArch64] Add flag to control unrolling for small multi-exit loops (PR #131998)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 06:40:34 PDT 2025
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/131998
>From 9ebd338d8f936ec3febf700df6f91109c69f4bca Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 19 Mar 2025 10:22:25 +0000
Subject: [PATCH 1/5] [AArch64] Add flag to control unrolling for small
multi-exit loops
It can be highly beneficial to unroll small, two-block search loops
that look for a value in an array. An example of this would be
code that uses std::find from libc++ to search for a value. Older
versions of std::find in the libstdc++ headers are manually unrolled
in the source code, but this might change in newer releases where
the compiler is expected to either vectorise or unroll the loop itself.
This patch adds a new flag -small-multi-exit-loop-unroll-factor
that controls the amount of unrolling for such loops. This is
currently off by default, but in a future patch I plan to enable
it for some targets and to provide details of any performance
improvements.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 90 ++-
.../AArch64/unrolling-multi-exit.ll | 713 ++++++++++++++++++
2 files changed, 793 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e320b0e653ad4..38fcc3b9eab93 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+ "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden, // default 0: the guard in getUnrollingPreferences skips this unrolling when the value is zero
+ cl::desc(
+ "Force unrolling of small multi-exit loops with given unroll factor"));
+
// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4375,6 +4380,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+ AArch64TTIImpl &TTI) {
+ // Returns false when I should block unrolling. Vector-typed instructions are rejected first: vectorised loops are not unrolled.
+ if (I.getType()->isVectorTy())
+ return false;
+
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) { // calls and invokes need a closer look
+ if (const Function *F = cast<CallBase>(I).getCalledFunction())
+ if (!TTI.isLoweredToCall(F))
+ return true; // callee is known and TTI reports it is not lowered to a call
+ return false; // getCalledFunction() gave no callee, or the callee is lowered to a call
+ }
+
+ return true; // any other instruction is acceptable
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+ AArch64TTIImpl &TTI) {
+ // Small search loops with multiple exits can be highly beneficial to unroll.
+ // We only care about loops with exactly two exiting blocks: the loop must have
+ // two blocks and getExitingBlock() must return null. Each block could jump to the same exit block.
+ SmallVector<BasicBlock *> Blocks(L->getBlocks());
+ if (Blocks.size() != 2 || L->getExitingBlock())
+ return false;
+
+ if (any_of(Blocks, [](BasicBlock *BB) { // every block must end in a BranchInst
+ return !isa<BranchInst>(BB->getTerminator());
+ }))
+ return false;
+
+ // Only consider loops with unknown trip counts for which we can determine
+ // a symbolic expression. Multi-exit loops with small known trip counts will
+ // likely be unrolled anyway.
+ const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+ if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+ return false;
+
+ // It might not be worth unrolling loops with low max trip counts. Restrict
+ // this to max trip counts > 32 for now. A MaxTC of 0 is let through (presumably "unknown" - confirm).
+ unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+ if (MaxTC > 0 && MaxTC <= 32)
+ return false;
+
+ // Estimate the size of the loop by summing the TCK_CodeSize cost of every instruction.
+ int64_t Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!shouldUnrollLoopWithInstruction(I, TTI))
+ return false;
+
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ InstructionCost Cost =
+ TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+ // This can happen with intrinsics that don't currently have a cost model
+ // or for some operations that require SVE.
+ if (!Cost.isValid())
+ return false;
+ Size += *Cost.getValue();
+ }
+ }
+
+ return Size < 6; // only loops whose summed cost is below 6 qualify
+}
+
/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
/// OOO engine's wide instruction window and various predictors.
static void
@@ -4550,22 +4619,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
break;
}
+ if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+ UP.RuntimeUnrollMultiExit = true;
+ UP.Runtime = true;
+ // Limit unroll count.
+ UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+ // Allow slightly more costly trip-count expansion to catch search loops
+ // with pointer inductions.
+ UP.SCEVExpansionBudget = 5;
+ }
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
// unrolling.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
+ if (!shouldUnrollLoopWithInstruction(I, *this))
return;
-
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
- if (!isLoweredToCall(F))
- continue;
- }
- return;
- }
}
}
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
new file mode 100644
index 0000000000000..b799b4328400a
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
@@ -0,0 +1,713 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mcpu=generic -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefixes=COMMON,UNROLL2 %s
+; RUN: opt -p loop-unroll -mcpu=generic -S %s | FileCheck --check-prefixes=COMMON,GENERIC %s
+
+target triple = "aarch64-linux-gnu"
+
+define i1 @multi_2_exiting_find_i8_loop_same_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; GENERIC: [[EXIT]]:
+; GENERIC-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; GENERIC-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; GENERIC-NEXT: ret i1 [[C_3]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_diff_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT: br i1 [[TMP3]], label %[[LATCH_EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EARLY_EXIT_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[LATCH_EXIT_UNR_LCSSA:.*]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EARLY_EXIT_LOOPEXIT]]:
+; UNROLL2-NEXT: br label %[[EARLY_EXIT]]
+; UNROLL2: [[EARLY_EXIT]]:
+; UNROLL2-NEXT: ret i1 true
+; UNROLL2: [[LATCH_EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: br label %[[LATCH_EXIT]]
+; UNROLL2: [[LATCH_EXIT]]:
+; UNROLL2-NEXT: ret i1 false
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[LATCH_EXIT:.*]], label %[[LOOP_HEADER]]
+; GENERIC: [[EARLY_EXIT]]:
+; GENERIC-NEXT: ret i1 true
+; GENERIC: [[LATCH_EXIT]]:
+; GENERIC-NEXT: ret i1 false
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %early.exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %latch.exit, label %loop.header
+
+early.exit:
+ ret i1 1
+
+latch.exit:
+ ret i1 0
+}
+
+
+define i1 @multi_2_exiting_find_ptr_loop_same_exit(ptr %vec, ptr %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_ptr_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
+; UNROLL2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; UNROLL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; UNROLL2-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; UNROLL2-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load ptr, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 1
+; UNROLL2-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_ptr_loop_same_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; GENERIC: [[EXIT]]:
+; GENERIC-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; GENERIC-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; GENERIC-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; GENERIC-NEXT: ret i1 [[C_3]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ]
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 8
+ %end = load ptr, ptr %gep.end, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ]
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load ptr, ptr %ptr.iv, align 8
+ %c.1 = icmp eq ptr %l, %tgt
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 8
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+}
+
+
+define ptr @multi_2_exiting_find_ptr_loop_diff_exit(ptr %vec, ptr %tgt) {
+; UNROLL2-LABEL: define ptr @multi_2_exiting_find_ptr_loop_diff_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START3:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[TMP0:%.*]] = add i64 [[END2]], -8
+; UNROLL2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START3]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; UNROLL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; UNROLL2-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; UNROLL2-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load ptr, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 1
+; UNROLL2-NEXT: br i1 [[TMP6]], label %[[LATCH_EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EARLY_EXIT_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[LATCH_EXIT_UNR_LCSSA:.*]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EARLY_EXIT_LOOPEXIT]]:
+; UNROLL2-NEXT: [[L_LCSSA_PH:%.*]] = phi ptr [ [[L]], %[[LOOP_HEADER]] ], [ [[L_1]], %[[LOOP_LATCH]] ]
+; UNROLL2-NEXT: br label %[[EARLY_EXIT]]
+; UNROLL2: [[EARLY_EXIT]]:
+; UNROLL2-NEXT: [[L_LCSSA:%.*]] = phi ptr [ [[L_PROL]], %[[LOOP_HEADER_PROL]] ], [ [[L_LCSSA_PH]], %[[EARLY_EXIT_LOOPEXIT]] ]
+; UNROLL2-NEXT: ret ptr [[L_LCSSA]]
+; UNROLL2: [[LATCH_EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: br label %[[LATCH_EXIT]]
+; UNROLL2: [[LATCH_EXIT]]:
+; UNROLL2-NEXT: ret ptr [[END]]
+;
+; GENERIC-LABEL: define ptr @multi_2_exiting_find_ptr_loop_diff_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[LATCH_EXIT:.*]], label %[[LOOP_HEADER]]
+; GENERIC: [[EARLY_EXIT]]:
+; GENERIC-NEXT: [[L_LCSSA:%.*]] = phi ptr [ [[L]], %[[LOOP_HEADER]] ]
+; GENERIC-NEXT: ret ptr [[L_LCSSA]]
+; GENERIC: [[LATCH_EXIT]]:
+; GENERIC-NEXT: ret ptr [[END]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ]
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 8
+ %end = load ptr, ptr %gep.end, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ]
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load ptr, ptr %ptr.iv, align 8
+ %c.1 = icmp eq ptr %l, %tgt
+ br i1 %c.1, label %early.exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 8
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %latch.exit, label %loop.header
+
+early.exit:
+ ret ptr %l
+
+latch.exit:
+ ret ptr %end
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_too_large(ptr %vec, i8 %tgt) {
+; COMMON-LABEL: define i1 @multi_2_exiting_find_i8_loop_too_large(
+; COMMON-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*]]:
+; COMMON-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; COMMON-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; COMMON-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; COMMON-NEXT: br label %[[LOOP_HEADER:.*]]
+; COMMON: [[LOOP_HEADER]]:
+; COMMON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; COMMON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; COMMON-NEXT: [[UDIV:%.*]] = udiv i8 [[L]], [[TGT]]
+; COMMON-NEXT: [[UDIV_2:%.*]] = udiv i8 [[UDIV]], 10
+; COMMON-NEXT: [[C_1:%.*]] = icmp eq i8 [[UDIV_2]], 2
+; COMMON-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; COMMON: [[LOOP_LATCH]]:
+; COMMON-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; COMMON-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COMMON-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; COMMON-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; COMMON-NEXT: ret i1 [[C_3]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %udiv = udiv i8 %l, %tgt
+ %udiv.2 = udiv i8 %udiv, 10
+ %c.1 = icmp eq i8 %udiv.2, 2
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+}
+
+
+define i1 @multi_3_exiting_find_ptr_loop(ptr %vec, ptr %tgt, ptr %tgt2) {
+; COMMON-LABEL: define i1 @multi_3_exiting_find_ptr_loop(
+; COMMON-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]], ptr [[TGT2:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*]]:
+; COMMON-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; COMMON-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; COMMON-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; COMMON-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; COMMON-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; COMMON-NEXT: br label %[[LOOP_HEADER:.*]]
+; COMMON: [[LOOP_HEADER]]:
+; COMMON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; COMMON-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; COMMON-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; COMMON-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], [[TGT2]]
+; COMMON-NEXT: [[OR_COND:%.*]] = select i1 [[C_1]], i1 true, i1 [[C_2]]
+; COMMON-NEXT: br i1 [[OR_COND]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; COMMON: [[LOOP_LATCH]]:
+; COMMON-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; COMMON-NEXT: [[C_3:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COMMON-NEXT: br i1 [[C_3]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; COMMON-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; COMMON-NEXT: [[C_4:%.*]] = icmp eq ptr [[RES]], [[END]]
+; COMMON-NEXT: ret i1 [[C_4]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ]
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 8
+ %end = load ptr, ptr %gep.end, align 8
+ call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ]
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load ptr, ptr %ptr.iv, align 8
+ %c.1 = icmp eq ptr %l, %tgt
+ br i1 %c.1, label %exit, label %then
+
+then:
+ %c.2 = icmp eq ptr %l, %tgt2
+ br i1 %c.2, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 8
+ %c.3 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.3, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %ptr.iv, %then], [ %end, %loop.latch ]
+ call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ]
+ %c.4 = icmp eq ptr %res, %end
+ ret i1 %c.4
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_switch(ptr %vec, i8 %tgt) {
+; COMMON-LABEL: define i1 @multi_2_exiting_find_i8_loop_switch(
+; COMMON-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*]]:
+; COMMON-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; COMMON-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; COMMON-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; COMMON-NEXT: br label %[[LOOP_HEADER:.*]]
+; COMMON: [[LOOP_HEADER]]:
+; COMMON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; COMMON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; COMMON-NEXT: switch i8 [[L]], label %[[LOOP_LATCH]] [
+; COMMON-NEXT: i8 0, label %[[EXIT_1:.*]]
+; COMMON-NEXT: i8 1, label %[[EXIT_2:.*]]
+; COMMON-NEXT: i8 2, label %[[EXIT:.*]]
+; COMMON-NEXT: ]
+; COMMON: [[LOOP_LATCH]]:
+; COMMON-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; COMMON-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COMMON-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; COMMON-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; COMMON-NEXT: ret i1 [[C_3]]
+; COMMON: [[EXIT_1]]:
+; COMMON-NEXT: ret i1 false
+; COMMON: [[EXIT_2]]:
+; COMMON-NEXT: ret i1 true
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ switch i8 %l, label %loop.latch [
+ i8 0, label %exit.1
+ i8 1, label %exit.2
+ i8 2, label %exit ]
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+
+exit.1:
+ ret i1 0
+
+exit.2:
+ ret i1 1
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_small_max_tc(ptr %vec, i8 %tgt, i5 %n5) {
+; COMMON-LABEL: define i1 @multi_2_exiting_find_i8_loop_small_max_tc(
+; COMMON-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]], i5 [[N5:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[C_0:%.*]] = icmp sgt i5 [[N5]], 0
+; COMMON-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; COMMON-NEXT: br i1 [[C_0]], label %[[LOOP_PH:.*]], label %[[EXIT:.*]]
+; COMMON: [[LOOP_PH]]:
+; COMMON-NEXT: [[N64:%.*]] = zext i5 [[N5]] to i64
+; COMMON-NEXT: [[END:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 [[N64]]
+; COMMON-NEXT: br label %[[LOOP_HEADER:.*]]
+; COMMON: [[LOOP_HEADER]]:
+; COMMON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[LOOP_PH]] ]
+; COMMON-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[LOOP_PH]] ]
+; COMMON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; COMMON-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; COMMON-NEXT: br i1 [[C_1]], label %[[LOOP_EXIT:.*]], label %[[LOOP_LATCH]]
+; COMMON: [[LOOP_LATCH]]:
+; COMMON-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; COMMON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; COMMON-NEXT: [[C_2:%.*]] = icmp eq i64 [[IV_NEXT]], [[N64]]
+; COMMON-NEXT: br i1 [[C_2]], label %[[LOOP_EXIT]], label %[[LOOP_HEADER]]
+; COMMON: [[LOOP_EXIT]]:
+; COMMON-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; COMMON-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; COMMON-NEXT: ret i1 [[C_3]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: ret i1 false
+;
+entry:
+ %c.0 = icmp sgt i5 %n5, 0
+ %start = load ptr, ptr %vec, align 8
+ br i1 %c.0, label %loop.ph, label %exit
+
+loop.ph:
+ %n64 = zext i5 %n5 to i64
+ %end = getelementptr inbounds nuw i8, ptr %start, i64 %n64
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %loop.ph ]
+ %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %loop.ph ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %loop.exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %c.2 = icmp eq i64 %iv.next, %n64
+ br i1 %c.2, label %loop.exit, label %loop.header
+
+loop.exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+
+exit:
+ ret i1 false
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_invalid_insn(ptr %vec, i8 %tgt) #0 {
+; COMMON-LABEL: define i1 @multi_2_exiting_find_i8_loop_invalid_insn(
+; COMMON-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*]]:
+; COMMON-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; COMMON-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; COMMON-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; COMMON-NEXT: br label %[[LOOP_HEADER:.*]]
+; COMMON: [[LOOP_HEADER]]:
+; COMMON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; COMMON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; COMMON-NEXT: [[DEINTER:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> poison)
+; COMMON-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; COMMON-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; COMMON: [[LOOP_LATCH]]:
+; COMMON-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; COMMON-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; COMMON-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; COMMON-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; COMMON-NEXT: ret i1 [[C_3]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %deinter = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> poison)
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+}
+
+
+declare void @llvm.assume(i1 noundef)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+
+attributes #0 = { "target-features"="-sve,-sve2" }
>From a79818c7942bc320872676eb3c0ed83364a10335 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 20 Mar 2025 13:31:31 +0000
Subject: [PATCH 2/5] Address review comment
---
.../AArch64/AArch64TargetTransformInfo.cpp | 95 ++++-----
.../AArch64/apple-unrolling-multi-exit.ll | 194 ++++++++++++++++++
2 files changed, 242 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 38fcc3b9eab93..e80433b53df45 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4396,20 +4396,29 @@ static bool shouldUnrollLoopWithInstruction(Instruction &I,
return true;
}
-static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
- AArch64TTIImpl &TTI) {
- // Small search loops with multiple exits can be highly beneficial to unroll.
- // We only care about loops with exactly two exiting blocks, although each
- // block could jump to the same exit block.
- SmallVector<BasicBlock *> Blocks(L->getBlocks());
- if (Blocks.size() != 2 || L->getExitingBlock())
- return false;
+static InstructionCost getSizeOfLoop(Loop *L, AArch64TTIImpl &TTI) {
+ // Estimate the size of the loop.
+ InstructionCost Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!shouldUnrollLoopWithInstruction(I, TTI))
+ return InstructionCost::getInvalid();
- if (any_of(Blocks, [](BasicBlock *BB) {
- return !isa<BranchInst>(BB->getTerminator());
- }))
- return false;
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ InstructionCost Cost =
+ TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+ // This can happen with intrinsics that don't currently have a cost model
+ // or for some operations that require SVE.
+ if (!Cost.isValid())
+ return InstructionCost::getInvalid();
+ Size += *Cost.getValue();
+ }
+ }
+ return Size;
+}
+static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
+ AArch64TTIImpl &TTI) {
// Only consider loops with unknown trip counts for which we can determine
// a symbolic expression. Multi-exit loops with small known trip counts will
// likely be unrolled anyway.
@@ -4423,25 +4432,27 @@ static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
if (MaxTC > 0 && MaxTC <= 32)
return false;
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return false;
+
// Estimate the size of the loop.
- int64_t Size = 0;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- if (!shouldUnrollLoopWithInstruction(I, TTI))
- return false;
+ InstructionCost Size = getSizeOfLoop(L, TTI);
+ if (!Size.isValid())
+ return false;
- SmallVector<const Value *, 4> Operands(I.operand_values());
- InstructionCost Cost =
- TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
- // This can happen with intrinsics that don't currently have a cost model
- // or for some operations that require SVE.
- if (!Cost.isValid())
- return false;
- Size += *Cost.getValue();
- }
- }
+ // Small search loops with multiple exits can be highly beneficial to unroll.
+ // We only care about loops with exactly two exiting blocks, although each
+ // block could jump to the same exit block.
+ SmallVector<BasicBlock *> Blocks(L->getBlocks());
+ if (Blocks.size() != 2)
+ return false;
+
+ if (any_of(Blocks, [](BasicBlock *BB) {
+ return !isa<BranchInst>(BB->getTerminator());
+ }))
+ return false;
- return Size < 6;
+ return *Size.getValue() < 6;
}
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
@@ -4477,24 +4488,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
}
}
- // Small search loops with multiple exits can be highly beneficial to unroll.
- if (!L->getExitBlock()) {
- if (L->getNumBlocks() == 2 && Size < 6 &&
- all_of(
- L->getBlocks(),
- [](BasicBlock *BB) {
- return isa<BranchInst>(BB->getTerminator());
- })) {
- UP.RuntimeUnrollMultiExit = true;
- UP.Runtime = true;
- // Limit unroll count.
- UP.DefaultUnrollRuntimeCount = 4;
- // Allow slightly more costly trip-count expansion to catch search loops
- // with pointer inductions.
- UP.SCEVExpansionBudget = 5;
- }
+ // Multi-exit loops are handled in getUnrollingPreferences.
+ if (!L->getExitBlock())
return;
- }
if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
return;
@@ -4604,12 +4600,15 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.PartialOptSizeThreshold = 0;
// Apply subtarget-specific unrolling preferences.
+ unsigned SmallMultiExitLoopUnrollFactor = SmallMultiExitLoopUF;
switch (ST->getProcFamily()) {
case AArch64Subtarget::AppleA14:
case AArch64Subtarget::AppleA15:
case AArch64Subtarget::AppleA16:
case AArch64Subtarget::AppleM4:
getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+ if (!SmallMultiExitLoopUF.getNumOccurrences())
+ SmallMultiExitLoopUnrollFactor = 4;
break;
case AArch64Subtarget::Falkor:
if (EnableFalkorHWPFUnrollFix)
@@ -4619,14 +4618,16 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
break;
}
- if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+ if (!L->getExitBlock() && SmallMultiExitLoopUnrollFactor &&
+ shouldUnrollMultiExitLoop(L, SE, *this)) {
UP.RuntimeUnrollMultiExit = true;
UP.Runtime = true;
// Limit unroll count.
- UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+ UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUnrollFactor;
// Allow slightly more costly trip-count expansion to catch search loops
// with pointer inductions.
UP.SCEVExpansionBudget = 5;
+ return;
}
// Scan the loop: don't unroll loops with calls as this could prevent
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
index 31b23eae0f866..5177a35f8d36b 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
@@ -3,6 +3,7 @@
; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m4 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -mcpu=apple-m4 -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefix=UNROLL2 %s
; RUN: opt -p loop-unroll -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=OTHER %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
@@ -86,6 +87,61 @@ define i1 @multi_2_exit_find_i8_loop(ptr %vec, i8 %tgt) {
; APPLE-NEXT: [[C_5:%.*]] = icmp eq ptr [[RES1]], [[END]]
; APPLE-NEXT: ret i1 [[C_5]]
;
+; UNROLL2-LABEL: define i1 @multi_2_exit_find_i8_loop(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
; OTHER-LABEL: define i1 @multi_2_exit_find_i8_loop(
; OTHER-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
; OTHER-NEXT: [[ENTRY:.*]]:
@@ -215,6 +271,67 @@ define i1 @multi_2_exit_find_ptr_loop(ptr %vec, ptr %tgt) {
; APPLE-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
; APPLE-NEXT: ret i1 [[C_3]]
;
+; UNROLL2-LABEL: define i1 @multi_2_exit_find_ptr_loop(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
+; UNROLL2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; UNROLL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; UNROLL2-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; UNROLL2-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load ptr, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 1
+; UNROLL2-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
; OTHER-LABEL: define i1 @multi_2_exit_find_ptr_loop(
; OTHER-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
@@ -289,6 +406,29 @@ define i1 @multi_2_exit_find_i8_loop_too_large(ptr %vec, i8 %tgt) {
; APPLE-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
; APPLE-NEXT: ret i1 [[C_3]]
;
+; UNROLL2-LABEL: define i1 @multi_2_exit_find_i8_loop_too_large(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[UDIV:%.*]] = udiv i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: [[UDIV_2:%.*]] = udiv i8 [[UDIV]], 10
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[UDIV_2]], 2
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
; OTHER-LABEL: define i1 @multi_2_exit_find_i8_loop_too_large(
; OTHER-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
@@ -363,6 +503,32 @@ define i1 @multi_3_exit_find_ptr_loop(ptr %vec, ptr %tgt, ptr %tgt2) {
; APPLE-NEXT: [[C_4:%.*]] = icmp eq ptr [[RES]], [[END]]
; APPLE-NEXT: ret i1 [[C_4]]
;
+; UNROLL2-LABEL: define i1 @multi_3_exit_find_ptr_loop(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]], ptr [[TGT2:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], [[TGT2]]
+; UNROLL2-NEXT: [[OR_COND:%.*]] = select i1 [[C_1]], i1 true, i1 [[C_2]]
+; UNROLL2-NEXT: br i1 [[OR_COND]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_3]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[C_4:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_4]]
+;
; OTHER-LABEL: define i1 @multi_3_exit_find_ptr_loop(
; OTHER-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]], ptr [[TGT2:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
@@ -448,6 +614,34 @@ define i1 @multi_3_exit_find_i8_loop_switch(ptr %vec, i8 %tgt) {
; APPLE: [[EXIT_2]]:
; APPLE-NEXT: ret i1 true
;
+; UNROLL2-LABEL: define i1 @multi_3_exit_find_i8_loop_switch(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: switch i8 [[L]], label %[[LOOP_LATCH]] [
+; UNROLL2-NEXT: i8 0, label %[[EXIT_1:.*]]
+; UNROLL2-NEXT: i8 1, label %[[EXIT_2:.*]]
+; UNROLL2-NEXT: i8 2, label %[[EXIT:.*]]
+; UNROLL2-NEXT: ]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+; UNROLL2: [[EXIT_1]]:
+; UNROLL2-NEXT: ret i1 false
+; UNROLL2: [[EXIT_2]]:
+; UNROLL2-NEXT: ret i1 true
+;
; OTHER-LABEL: define i1 @multi_3_exit_find_i8_loop_switch(
; OTHER-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
>From e3e6e9b7acd19c3ed8a67a44e4e861e13d4c2d34 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 27 Mar 2025 13:17:15 +0000
Subject: [PATCH 3/5] Address review comment
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80433b53df45..d391beff0a3e4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4443,7 +4443,7 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
// Small search loops with multiple exits can be highly beneficial to unroll.
// We only care about loops with exactly two exiting blocks, although each
// block could jump to the same exit block.
- SmallVector<BasicBlock *> Blocks(L->getBlocks());
+ ArrayRef<BasicBlock*> Blocks = L->getBlocks();
if (Blocks.size() != 2)
return false;
>From b1b95edf09a8ac9d2325e73505697a845dd2b4e0 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 27 Mar 2025 13:24:11 +0000
Subject: [PATCH 4/5] Fix up code formatting
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d391beff0a3e4..ef546d9272ee5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4443,7 +4443,7 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
// Small search loops with multiple exits can be highly beneficial to unroll.
// We only care about loops with exactly two exiting blocks, although each
// block could jump to the same exit block.
- ArrayRef<BasicBlock*> Blocks = L->getBlocks();
+ ArrayRef<BasicBlock *> Blocks = L->getBlocks();
if (Blocks.size() != 2)
return false;
>From fb755c0b2a2e7ec5bb6a44b2878c064c9f642865 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 31 Mar 2025 13:39:47 +0000
Subject: [PATCH 5/5] Address review comments
---
.../AArch64/AArch64TargetTransformInfo.cpp | 65 +++++++++----------
.../AArch64/apple-unrolling-multi-exit.ll | 2 +-
.../AArch64/unrolling-multi-exit.ll | 2 +-
3 files changed, 32 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ef546d9272ee5..5f4be644c8658 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -69,7 +69,7 @@ static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
cl::init(true), cl::Hidden);
static cl::opt<unsigned> SmallMultiExitLoopUF(
- "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+ "aarch64-small-multi-exit-loop-unroll", cl::init(0), cl::Hidden,
cl::desc(
"Force unrolling of small multi-exit loops with given unroll factor"));
@@ -4386,23 +4386,29 @@ static bool shouldUnrollLoopWithInstruction(Instruction &I,
if (I.getType()->isVectorTy())
return false;
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- if (const Function *F = cast<CallBase>(I).getCalledFunction())
- if (!TTI.isLoweredToCall(F))
- return true;
+ if (isa<CallBase>(I)) {
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (const Function *F = cast<CallBase>(I).getCalledFunction())
+ if (!TTI.isLoweredToCall(F))
+ return true;
return false;
}
return true;
}
-static InstructionCost getSizeOfLoop(Loop *L, AArch64TTIImpl &TTI) {
+static unsigned getLoopSize(Loop *L, AArch64TTIImpl &TTI,
+ InstructionCost Budget) {
// Estimate the size of the loop.
- InstructionCost Size = 0;
+ InstructionCost LoopCost = 0;
+
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return 0;
+
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
if (!shouldUnrollLoopWithInstruction(I, TTI))
- return InstructionCost::getInvalid();
+ return 0;
SmallVector<const Value *, 4> Operands(I.operand_values());
InstructionCost Cost =
@@ -4410,11 +4416,15 @@ static InstructionCost getSizeOfLoop(Loop *L, AArch64TTIImpl &TTI) {
// This can happen with intrinsics that don't currently have a cost model
// or for some operations that require SVE.
if (!Cost.isValid())
- return InstructionCost::getInvalid();
- Size += *Cost.getValue();
+ return 0;
+
+ LoopCost += Cost;
+ if (LoopCost > Budget)
+ return 0;
}
}
- return Size;
+
+ return *LoopCost.getValue();
}
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
@@ -4432,12 +4442,8 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
if (MaxTC > 0 && MaxTC <= 32)
return false;
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
- return false;
-
// Estimate the size of the loop.
- InstructionCost Size = getSizeOfLoop(L, TTI);
- if (!Size.isValid())
+ if (!getLoopSize(L, TTI, 5))
return false;
// Small search loops with multiple exits can be highly beneficial to unroll.
@@ -4452,7 +4458,7 @@ static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
}))
return false;
- return *Size.getValue() < 6;
+ return true;
}
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
@@ -4469,28 +4475,15 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->isInnermost() || L->getNumBlocks() > 8)
return;
+ // This is handled by common code.
+ if (!L->getExitBlock())
+ return;
+
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
(SE.getSmallConstantMaxTripCount(L) > 0 &&
SE.getSmallConstantMaxTripCount(L) <= 32))
return;
- if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
- return;
-
- int64_t Size = 0;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
- return;
- SmallVector<const Value *, 4> Operands(I.operand_values());
- Size +=
- *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
- }
- }
-
- // This is handled by common code.
- if (!L->getExitBlock())
- return;
if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
return;
@@ -4502,7 +4495,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// dependencies, to expose more parallel memory access streams.
BasicBlock *Header = L->getHeader();
if (Header == L->getLoopLatch()) {
- if (Size > 8)
+ // Estimate the size of the loop.
+ unsigned Size = getLoopSize(L, TTI, 8);
+ if (!Size)
return;
SmallPtrSet<Value *, 8> LoadedValues;
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
index 5177a35f8d36b..51e992f55b487 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
@@ -3,7 +3,7 @@
; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m4 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -mcpu=apple-m4 -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefix=UNROLL2 %s
+; RUN: opt -p loop-unroll -mcpu=apple-m4 -aarch64-small-multi-exit-loop-unroll=2 -S %s | FileCheck --check-prefix=UNROLL2 %s
; RUN: opt -p loop-unroll -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=OTHER %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
index b799b4328400a..9a609450f8636 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-unroll -mcpu=generic -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefixes=COMMON,UNROLL2 %s
+; RUN: opt -p loop-unroll -mcpu=generic -aarch64-small-multi-exit-loop-unroll=2 -S %s | FileCheck --check-prefixes=COMMON,UNROLL2 %s
; RUN: opt -p loop-unroll -mcpu=generic -S %s | FileCheck --check-prefixes=COMMON,GENERIC %s
target triple = "aarch64-linux-gnu"
More information about the llvm-commits
mailing list