[llvm] [VPlan] Support arbitrary predicated early exits. (PR #182396)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 19 14:56:33 PST 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/182396
>From ea867d3891d5a75c1dfc3ce6ec2a689bf939e811 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 15 Feb 2026 13:03:47 +0000
Subject: [PATCH 1/2] [VPlan] Support conditionally executed single early
exits.
Add support for a single early exit that is executed conditionally.
We need to make sure the mask from any non-exiting control flow is combined
with the early exit condition.
To do so, introduce a MaskedCond VPInstruction, which is inserted as a
user of the early-exit condition, at the point of the early-exit branch.
The VPInstruction will get masked automatically if needed by the
predicator, ensuring that we properly account for it when checking
whether the early exit has been taken.
Note that this does not allow for instructions that require predication
after the early exit. Supporting those requires additional work, which is
in progress:
https://github.com/llvm/llvm-project/pull/172454
As an alternative to MaskedCond, we could also predicate before handling
early exiting blocks: https://github.com/llvm/llvm-project/pull/181830
---
.../Vectorize/LoopVectorizationLegality.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 6 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 22 ++-
.../LoopVectorize/early_exit_legality.ll | 38 +----
.../early_exit_store_legality.ll | 2 +-
.../LoopVectorize/predicated-single-exit.ll | 157 +++++++++++-------
.../single_early_exit_live_outs.ll | 2 +-
9 files changed, 138 insertions(+), 101 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e57e0cf636501..67e195488850e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1765,8 +1765,10 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
}
+ // For predicated early exits, we only support a single early exit for now.
BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
- if (LatchPredBB != UncountableExitingBlocks.back()) {
+ if (UncountableExitingBlocks.size() != 1 &&
+ LatchPredBB != UncountableExitingBlocks.back()) {
reportVectorizationFailure(
"Last early exiting block in the chain is not the latch predecessor",
"Cannot vectorize early exit loop", "EarlyExitNotLatchPredecessor", ORE,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..7863a5d955d3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1270,7 +1270,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// backedge value). Takes the wide induction recipe and the original
/// backedge value as operands.
ExitingIVValue,
- OpsEnd = ExitingIVValue,
+ MaskedCond,
+ OpsEnd = MaskedCond,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -1304,6 +1305,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// Returns true if the VPInstruction does not need masking.
bool alwaysUnmasked() const {
+ if (Opcode == VPInstruction::MaskedCond)
+ return false;
+
// For now only VPInstructions with underlying values use masks.
// TODO: provide masks to VPInstructions w/o underlying values.
if (!getUnderlyingValue())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..998e48d411f50 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -129,6 +129,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
"LogicalAnd/Or operands should be bool");
return IntegerType::get(Ctx, 1);
+ case VPInstruction::MaskedCond:
+ assert(inferScalarType(R->getOperand(0))->isIntegerTy(1));
+ return IntegerType::get(Ctx, 1);
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnTwoConds:
case VPInstruction::BranchOnCount:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..5fd24fbedbe57 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -457,6 +457,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::MaskedCond:
case VPInstruction::Not:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
@@ -1345,6 +1346,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::LogicalOr:
+ case VPInstruction::MaskedCond:
case VPInstruction::Not:
case VPInstruction::PtrAdd:
case VPInstruction::WideIVStep:
@@ -1491,6 +1493,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExitingIVValue:
O << "exiting-iv-value";
break;
+ case VPInstruction::MaskedCond:
+ O << "masked-cond";
+ break;
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..e86fdd781252b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1438,6 +1438,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}
}
+ if (match(Def, m_VPInstruction<VPInstruction::MaskedCond>())) {
+ if (Def->getNumOperands() == 2) {
+ VPValue *And = Builder.createNaryOp(
+ VPInstruction::LogicalAnd, {Def->getOperand(0), Def->getOperand(1)});
+ return Def->replaceAllUsesWith(And);
+ }
+ return Def->replaceAllUsesWith(Def->getOperand(0));
+ }
+
// Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
// This is useful for fmax/fmin without fast-math flags, where we need to
// check if any operand is NaN.
@@ -4080,10 +4089,17 @@ void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
match(EarlyExitingVPBB->getTerminator(),
m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
assert(Matched && "Terminator must be BranchOnCond");
- auto *CondToEarlyExit = TrueSucc == ExitBlock
- ? CondOfEarlyExitingVPBB
- : Builder.createNot(CondOfEarlyExitingVPBB);
+
+ // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
+ // the correct block mask.
+ VPBuilder EarlyExitBuilder(EarlyExitingVPBB->getTerminator());
+ auto *CondToEarlyExit = EarlyExitBuilder.createNaryOp(
+ VPInstruction::MaskedCond,
+ TrueSucc == ExitBlock
+ ? CondOfEarlyExitingVPBB
+ : EarlyExitBuilder.createNot(CondOfEarlyExitingVPBB));
assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
+ !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
VPDT.properlyDominates(
CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
LatchVPBB)) &&
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 2b68a4787b15a..03b66d372c5f9 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -318,7 +318,7 @@ return:
; support this yet.
define i64 @uncountable_exit_on_last_block() {
; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_on_last_block'
-; CHECK: LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
+; CHECK: LV: Not vectorizing: Cannot determine exact exit count for latch block.
entry:
%p1 = alloca [1024 x i8]
%p2 = alloca [1024 x i8]
@@ -492,42 +492,6 @@ exit: ; preds = %for.body
ret void
}
-define i64 @uncountable_exit_in_conditional_block(ptr %mask) {
-; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block'
-; CHECK: LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
-entry:
- %p1 = alloca [1024 x i8]
- %p2 = alloca [1024 x i8]
- call void @init_mem(ptr %p1, i64 1024)
- call void @init_mem(ptr %p2, i64 1024)
- br label %loop
-
-loop:
- %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
- %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index
- %ld1 = load i8, ptr %arrayidx1, align 1
- %cmp1 = icmp ne i8 %ld1, 0
- br i1 %cmp1, label %loop.search, label %loop.inc
-
-loop.search:
- %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index
- %ld2 = load i8, ptr %arrayidx2, align 1
- %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index
- %ld3 = load i8, ptr %arrayidx3, align 1
- %cmp2 = icmp eq i8 %ld2, %ld3
- br i1 %cmp2, label %loop.inc, label %loop.end
-
-loop.inc:
- %index.next = add i64 %index, 1
- %exitcond = icmp ne i64 %index.next, 67
- br i1 %exitcond, label %loop, label %loop.end
-
-loop.end:
- %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ]
- ret i64 %retval
-}
-
-
define i64 @same_exit_block_pre_inc_use1_with_reduction() {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction'
; CHECK: LV: Not vectorizing: Found reductions or recurrences in early-exit loop.
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
index 55b52299d4331..bbdbd646cc2b3 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
@@ -461,7 +461,7 @@ exit:
define void @loop_contains_store_uncounted_exit_is_not_guaranteed_to_execute(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_uncounted_exit_is_not_guaranteed_to_execute'
-; CHECK: LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
+; CHECK: LV: Not vectorizing: Load for uncountable exit not guaranteed to execute.
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/predicated-single-exit.ll b/llvm/test/Transforms/LoopVectorize/predicated-single-exit.ll
index 7e8baf9d7e621..f26e940772532 100644
--- a/llvm/test/Transforms/LoopVectorize/predicated-single-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicated-single-exit.ll
@@ -7,25 +7,34 @@
define i64 @single_exit_in_conditional_block() {
; CHECK-LABEL: define i64 @single_exit_in_conditional_block() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], %[[BLOCK_A]] ], [ -1, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false)
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[IV]], [[TMP8]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP9]], %[[LOOP_END]] ], [ -1, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -113,27 +122,34 @@ loop.end:
define i64 @single_exit_in_conditional_block2() {
; CHECK-LABEL: define i64 @single_exit_in_conditional_block2() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[MERGE:.*]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[MERGE:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_END:.*]], label %[[MERGE]]
; CHECK: [[MERGE]]:
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_LATCH:.*]]
+; CHECK: [[LOOP_END]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false)
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[IV]], [[TMP8]]
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
-; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], %[[BLOCK_A]] ], [ -1, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP9]], %[[LOOP_END]] ], [ -1, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -168,23 +184,30 @@ loop.end:
; Variant: exit condition defined in header but used in conditional block.
define i64 @exit_cond_defined_in_header() {
; CHECK-LABEL: define i64 @exit_cond_defined_in_header() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i8 [[L_A]], 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT: [[BRANCH_COND:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[BLOCK_A]]:
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[LOOP_END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 10, %[[BLOCK_A]] ], [ [[IV]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 10, %[[BLOCK_A]] ], [ 63, %[[LOOP_END]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -214,22 +237,33 @@ loop.end:
define i64 @livein_exit_cond_in_conditional(i1 %exit.cond) {
; CHECK-LABEL: define i64 @livein_exit_cond_in_conditional(
; CHECK-SAME: i1 [[EXIT_COND:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[EXIT_COND]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[BRANCH_COND:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[BLOCK_A]]:
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[LOOP_END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], %[[BLOCK_A]] ], [ 99, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[IV]], [[TMP6]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP7]], %[[BLOCK_A]] ], [ 99, %[[LOOP_END]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -258,22 +292,31 @@ loop.end:
define i64 @livein_exit_cond_in_conditional2(i1 %exit.cond) {
; CHECK-LABEL: define i64 @livein_exit_cond_in_conditional2(
; CHECK-SAME: i1 [[EXIT_COND:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[EXIT_COND]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[BRANCH_COND:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[BLOCK_A]]:
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[LOOP_END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 99, %[[BLOCK_A]] ], [ [[IV]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 99, %[[BLOCK_A]] ], [ 63, %[[LOOP_END]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -324,7 +367,7 @@ define i64 @diamond_with_join_then_exit() {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[VECTOR_BODY_INTERIM]]
; CHECK: [[VECTOR_BODY_INTERIM]]:
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[LOOP_END:.*]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 19973c9f2eea6..c13856e288fd5 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -402,8 +402,8 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) {
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
>From 73f02ae00d6eb62e24522140b33582e11e7b42e9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Feb 2026 14:28:08 +0000
Subject: [PATCH 2/2] [VPlan] Support arbitrary predicated early exits.
This removes the restriction requiring a single predicated early exit.
Using MaskedCond, we only combine early-exit conditions with block
masks from non-exiting control flow.
This means we have to ensure that we check the early exit conditions in
program order, so that among the exits triggered at the first lane of the
combined exit condition, we take the one that comes first in program order.
To do so, sort the exits by their reverse post-order numbers.
---
.../Vectorize/LoopVectorizationLegality.cpp | 31 --
.../Transforms/Vectorize/VPlanTransforms.cpp | 26 +-
.../LoopVectorize/early_exit_legality.ll | 7 +-
.../predicated-multiple-exits.ll | 423 +++++++++++-------
.../LoopVectorize/unsupported_early_exit.ll | 79 +---
5 files changed, 306 insertions(+), 260 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 67e195488850e..40bbf4b38b4f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1745,37 +1745,6 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
return false;
}
- // Sort exiting blocks by dominance order to establish a clear chain.
- DT->updateDFSNumbers();
- llvm::sort(UncountableExitingBlocks, [this](BasicBlock *A, BasicBlock *B) {
- return DT->getNode(A)->getDFSNumIn() < DT->getNode(B)->getDFSNumIn();
- });
-
- // Verify that exits form a strict dominance chain: each block must
- // dominate the next. This ensures each exit is only dominated by its
- // predecessors in the chain.
- for (unsigned I = 0; I + 1 < UncountableExitingBlocks.size(); ++I) {
- if (!DT->properlyDominates(UncountableExitingBlocks[I],
- UncountableExitingBlocks[I + 1])) {
- reportVectorizationFailure(
- "Uncountable early exits do not form a dominance chain",
- "Cannot vectorize early exit loop with non-dominating exits",
- "NonDominatingEarlyExits", ORE, TheLoop);
- return false;
- }
- }
-
- // For predicated early exits, we only support a single early exit for now.
- BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
- if (UncountableExitingBlocks.size() != 1 &&
- LatchPredBB != UncountableExitingBlocks.back()) {
- reportVectorizationFailure(
- "Last early exiting block in the chain is not the latch predecessor",
- "Cannot vectorize early exit loop", "EarlyExitNotLatchPredecessor", ORE,
- TheLoop);
- return false;
- }
-
// The latch block must have a countable exit.
if (isa<SCEVCouldNotCompute>(
PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e86fdd781252b..c1cf6d2659a86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4113,10 +4113,28 @@ void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
}
assert(!Exits.empty() && "must have at least one early exit");
- // Sort exits by dominance to get the correct program order.
- llvm::sort(Exits, [&VPDT](const EarlyExitInfo &A, const EarlyExitInfo &B) {
- return VPDT.properlyDominates(A.EarlyExitingVPBB, B.EarlyExitingVPBB);
- });
+ // Sort exits by RPO order to get correct program order. RPO gives a
+ // topological ordering of the CFG, ensuring upstream exits are checked
+ // before downstream exits in the dispatch chain.
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ HeaderVPBB);
+ DenseMap<VPBlockBase *, unsigned> RPONumber;
+ unsigned Num = 0;
+ for (VPBlockBase *VPB : RPOT)
+ RPONumber[VPB] = Num++;
+ llvm::sort(
+ Exits, [&RPONumber](const EarlyExitInfo &A, const EarlyExitInfo &B) {
+ return RPONumber[A.EarlyExitingVPBB] < RPONumber[B.EarlyExitingVPBB];
+ });
+#ifndef NDEBUG
+ // After RPO sorting, verify for each adjacent pair of exits that the later
+ // one does not properly dominate the earlier one. This is guaranteed by RPO
+ // (topological order) and is required for the dispatch chain correctness.
+ for (unsigned I = 0; I + 1 < Exits.size(); ++I)
+ assert(!VPDT.properlyDominates(Exits[I + 1].EarlyExitingVPBB,
+ Exits[I].EarlyExitingVPBB) &&
+ "RPO sort must place dominating exits before dominated ones");
+#endif
// Build the AnyOf condition for the latch terminator using logical OR
// to avoid poison propagation from later exit conditions when an earlier
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 03b66d372c5f9..5d9f01368934d 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -558,9 +558,10 @@ loop.end:
; Two early exits on parallel branches (neither dominates the other).
+; This is now supported with predicated early exits.
define i64 @uncountable_exits_on_parallel_branches() {
; CHECK-LABEL: LV: Checking a loop in 'uncountable_exits_on_parallel_branches'
-; CHECK: LV: Not vectorizing: Uncountable early exits do not form a dominance chain.
+; CHECK: LV: We can vectorize this loop!
entry:
%p1 = alloca [1024 x i8]
%p2 = alloca [1024 x i8]
@@ -597,9 +598,11 @@ loop.end:
; Parallel uncountable exits with loop-invariant conditions.
+; Note: This loop cannot be vectorized because the latch has no determinate
+; exit count (loop is infinite without early exits).
define void @uncountable_exits_invariant_conditions(ptr %p, i1 %cond1, i1 %cond2, i1 %cond3) {
; CHECK-LABEL: LV: Checking a loop in 'uncountable_exits_invariant_conditions'
-; CHECK: LV: Not vectorizing: Uncountable early exits do not form a dominance chain.
+; CHECK: LV: Not vectorizing: Cannot determine exact exit count for latch block.
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll b/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
index 3d13ececb7740..e15d4fde3d70e 100644
--- a/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
@@ -8,31 +8,47 @@
define i64 @diamond_with_2_early_exits() {
; CHECK-LABEL: define i64 @diamond_with_2_early_exits() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_B:.*]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[L_B]] to i64
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> splat (i1 true), <4 x i1> [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP10]]
+; CHECK-NEXT: [[CMP_A:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[BLOCK_B]]:
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[EXT]], %[[BLOCK_A]] ], [ [[IV]], %[[BLOCK_B]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP10]], i1 false)
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[VECTOR_EARLY_EXIT_1:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[IV]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP16]], %[[VECTOR_EARLY_EXIT_0]] ], [ [[TMP15]], %[[VECTOR_EARLY_EXIT_1]] ], [ 0, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -70,38 +86,58 @@ loop.end:
define i64 @three_early_exits() {
; CHECK-LABEL: define i64 @three_early_exits() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[CHECK_B:.*]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[COND_A:%.*]] = icmp slt i8 [[L_A]], -42
-; CHECK-NEXT: br i1 [[COND_A]], label %[[BLOCK_A:.*]], label %[[CHECK_B:.*]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], splat (i8 -42)
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], splat (i8 42)
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP8]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> splat (i1 true), <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP17]]
+; CHECK-NEXT: [[COND_A:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[COND_A]], label %[[BLOCK_A:.*]], label %[[CHECK_B]]
; CHECK: [[CHECK_B]]:
-; CHECK-NEXT: [[COND_B:%.*]] = icmp slt i8 [[L_A]], 42
-; CHECK-NEXT: br i1 [[COND_B]], label %[[BLOCK_B:.*]], label %[[BLOCK_C:.*]]
-; CHECK: [[BLOCK_A]]:
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
-; CHECK-NEXT: br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[BLOCK_B:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[BLOCK_B]]:
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
-; CHECK: [[BLOCK_C]]:
-; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
-; CHECK-NEXT: [[L_D:%.*]] = load i8, ptr [[GEP_D]], align 1
-; CHECK-NEXT: [[CMP_C:%.*]] = icmp eq i8 [[L_A]], [[L_D]]
-; CHECK-NEXT: br i1 [[CMP_C]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false)
+; CHECK-NEXT: [[CMP_B:%.*]] = extractelement <4 x i1> [[TMP8]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH:.*]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP12]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_EARLY_EXIT_1:.*]], label %[[VECTOR_EARLY_EXIT_2:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_2]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, %[[BLOCK_A]] ], [ 2, %[[BLOCK_B]] ], [ 3, %[[BLOCK_C]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 3, %[[LOOP_END]] ], [ 2, %[[VECTOR_EARLY_EXIT_1]] ], [ 1, %[[VECTOR_EARLY_EXIT_2]] ], [ 0, %[[BLOCK_B]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -148,40 +184,58 @@ loop.end:
define i64 @nested_diamond_inner_exits() {
; CHECK-LABEL: define i64 @nested_diamond_inner_exits() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[OUTER_COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_B:.*]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[BLOCK_A2:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[INNER_COND:%.*]] = icmp slt i8 [[L_B]], 0
-; CHECK-NEXT: br i1 [[INNER_COND]], label %[[BLOCK_A1:.*]], label %[[BLOCK_A2:.*]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> splat (i1 true), <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP17]]
+; CHECK-NEXT: [[INNER_COND:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[INNER_COND]], label %[[BLOCK_A1:.*]], label %[[BLOCK_A2]]
+; CHECK: [[BLOCK_A2]]:
+; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_LATCH:.*]]
; CHECK: [[BLOCK_A1]]:
-; CHECK-NEXT: [[CMP_A1:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false)
+; CHECK-NEXT: [[CMP_A1:%.*]] = extractelement <4 x i1> [[TMP5]], i64 [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: br i1 [[CMP_A1]], label %[[LOOP_END:.*]], label %[[JOIN_A:.*]]
-; CHECK: [[BLOCK_A2]]:
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP_A2:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_A2]], label %[[LOOP_END]], label %[[JOIN_A]]
; CHECK: [[JOIN_A]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP12]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[BLOCK_B:.*]], label %[[VECTOR_EARLY_EXIT_2:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_2]]:
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[BLOCK_B]]:
-; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
-; CHECK-NEXT: [[L_D:%.*]] = load i8, ptr [[GEP_D]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_D]]
-; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, %[[BLOCK_A1]] ], [ 2, %[[BLOCK_A2]] ], [ 3, %[[BLOCK_B]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 3, %[[LOOP_END]] ], [ 2, %[[BLOCK_B]] ], [ 1, %[[VECTOR_EARLY_EXIT_2]] ], [ 0, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -231,35 +285,53 @@ loop.end:
define i64 @chain_of_3_exits() {
; CHECK-LABEL: define i64 @chain_of_3_exits() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[COND_A:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[COND_A]], label %[[BLOCK_A:.*]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[BLOCK_C:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
-; CHECK-NEXT: br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[BLOCK_B:.*]]
-; CHECK: [[BLOCK_B]]:
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END]], label %[[BLOCK_C:.*]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_C]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> splat (i1 true), <4 x i1> [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> splat (i1 true), <4 x i1> [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = freeze <4 x i1> [[TMP12]]
+; CHECK-NEXT: [[CMP_B:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END:.*]], label %[[BLOCK_C]]
; CHECK: [[BLOCK_C]]:
-; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
-; CHECK-NEXT: [[L_D:%.*]] = load i8, ptr [[GEP_D]], align 1
-; CHECK-NEXT: [[CMP_C:%.*]] = icmp eq i8 [[L_A]], [[L_D]]
-; CHECK-NEXT: br i1 [[CMP_C]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, %[[BLOCK_A]] ], [ 2, %[[BLOCK_B]] ], [ 3, %[[BLOCK_C]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[LOOP_LATCH:.*]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP7]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT_1:.*]], label %[[VECTOR_EARLY_EXIT_2:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_2]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 3, %[[VECTOR_EARLY_EXIT_2]] ], [ 2, %[[VECTOR_EARLY_EXIT_1]] ], [ 1, %[[VECTOR_EARLY_EXIT_0]] ], [ 0, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -302,41 +374,64 @@ loop.end:
define i64 @four_exits_2x2_diamond() {
; CHECK-LABEL: define i64 @four_exits_2x2_diamond() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[COND1:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[COND1]], label %[[BRANCH1_A:.*]], label %[[BRANCH1_B:.*]]
+; CHECK-NEXT: br label %[[BRANCH1_A:.*]]
; CHECK: [[BRANCH1_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[BRANCH2:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP1A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
-; CHECK-NEXT: br i1 [[CMP1A]], label %[[LOOP_END:.*]], label %[[BRANCH2:.*]]
-; CHECK: [[BRANCH1_B]]:
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP1B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP1B]], label %[[LOOP_END]], label %[[BRANCH2]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD3]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> splat (i1 true), <4 x i1> [[TMP8]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> splat (i1 true), <4 x i1> [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> splat (i1 true), <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP18]]
+; CHECK-NEXT: [[CMP1A:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[CMP1A]], label %[[LOOP_END:.*]], label %[[BRANCH2]]
; CHECK: [[BRANCH2]]:
-; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
-; CHECK-NEXT: [[L_D:%.*]] = load i8, ptr [[GEP_D]], align 1
-; CHECK-NEXT: [[COND2:%.*]] = icmp slt i8 [[L_D]], 0
-; CHECK-NEXT: br i1 [[COND2]], label %[[BRANCH2_A:.*]], label %[[BRANCH2_B:.*]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[BRANCH2_A:.*]], label %[[BRANCH1_A]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[BRANCH2_A]]:
-; CHECK-NEXT: [[CMP2A:%.*]] = icmp eq i8 [[L_A]], [[L_D]]
-; CHECK-NEXT: br i1 [[CMP2A]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[LOOP_END]]:
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP18]], i1 false)
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP5]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[BRANCH2_B:.*]]
; CHECK: [[BRANCH2_B]]:
-; CHECK-NEXT: [[CMP2B:%.*]] = icmp ne i8 [[L_A]], [[L_D]]
-; CHECK-NEXT: br i1 [[CMP2B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[VECTOR_EARLY_EXIT_1:.*]], label %[[LOOP_LATCH:.*]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
-; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, %[[BRANCH1_A]] ], [ 2, %[[BRANCH1_B]] ], [ 3, %[[BRANCH2_A]] ], [ 4, %[[BRANCH2_B]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP13]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP24]], label %[[VECTOR_EARLY_EXIT_2:.*]], label %[[VECTOR_EARLY_EXIT_3:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_3]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_2]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 4, %[[VECTOR_EARLY_EXIT_2]] ], [ 3, %[[VECTOR_EARLY_EXIT_3]] ], [ 2, %[[VECTOR_EARLY_EXIT_0]] ], [ 1, %[[VECTOR_EARLY_EXIT_1]] ], [ 0, %[[BRANCH2_A]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -410,7 +505,7 @@ define i64 @diamond_merge_then_exit_with_phi_liveout() {
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[VECTOR_BODY_INTERIM]]
; CHECK: [[VECTOR_BODY_INTERIM]]:
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[LOOP_END:.*]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
@@ -469,30 +564,46 @@ loop.end:
; disambiguates the exits.
define i64 @diamond_exits_overlapping_conditions() {
; CHECK-LABEL: define i64 @diamond_exits_overlapping_conditions() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[BLOCK_B:.*]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_B:.*]]
-; CHECK: [[BLOCK_A]]:
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
-; CHECK-NEXT: br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_C]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> splat (i1 true), <4 x i1> [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP9]]
+; CHECK-NEXT: [[COND:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_B]]
; CHECK: [[BLOCK_B]]:
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[LOOP_LATCH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
+; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP9]], i1 false)
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[LOOP_END:.*]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], %[[BLOCK_A]] ], [ [[IV]], %[[BLOCK_B]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[IV]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[IV]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP15]], %[[VECTOR_EARLY_EXIT_0]] ], [ [[TMP14]], %[[LOOP_END]] ], [ 0, %[[LOOP_LATCH]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -533,30 +644,42 @@ loop.end:
; block.c then has its own exit.
define i64 @exit_from_merge_of_exit_fallthrough_and_bypass() {
; CHECK-LABEL: define i64 @exit_from_merge_of_exit_fallthrough_and_bypass() {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT: br i1 [[COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_C:.*]]
+; CHECK-NEXT: br label %[[BLOCK_A:.*]]
; CHECK: [[BLOCK_A]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
-; CHECK-NEXT: br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[BLOCK_C]]
-; CHECK: [[BLOCK_C]]:
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT: [[CMP_C:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT: br i1 [[CMP_C]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_C]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> splat (i1 true), <4 x i1> [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT: [[CMP_C:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[CMP_C]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[LOOP_END1:.*]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, %[[BLOCK_A]] ], [ 2, %[[BLOCK_C]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 false)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[VECTOR_EARLY_EXIT_1:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT: br label %[[LOOP_END1]]
+; CHECK: [[LOOP_END1]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 2, %[[VECTOR_EARLY_EXIT_1]] ], [ 1, %[[VECTOR_EARLY_EXIT_0]] ], [ 0, %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
index 614c98c6b8016..aa9a84fa0d5bb 100644
--- a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
@@ -120,7 +120,7 @@ define i64 @loop_contains_unsafe_call() {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
; CHECK: loop.inc:
@@ -428,78 +428,6 @@ loop.end:
ret i64 %retval
}
-
-; Two uncountable early exits in a diamond pattern - they don't dominate each
-; other, so we can't determine a clear program order for checking them.
-define i64 @uncountable_exits_in_diamond_pattern() {
-; CHECK-LABEL: define i64 @uncountable_exits_in_diamond_pattern() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
-; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
-; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
-; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP1]], align 1
-; CHECK-NEXT: [[BRANCH_COND:%.*]] = icmp slt i8 [[LD1]], 0
-; CHECK-NEXT: br i1 [[BRANCH_COND]], label [[BLOCK_A:%.*]], label [[BLOCK_B:%.*]]
-; CHECK: block.a:
-; CHECK-NEXT: [[GEP2A:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2A:%.*]] = load i8, ptr [[GEP2A]], align 1
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp eq i8 [[LD1]], [[LD2A]]
-; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_END:%.*]], label [[LOOP_LATCH]]
-; CHECK: block.b:
-; CHECK-NEXT: [[GEP2B:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2B:%.*]] = load i8, ptr [[GEP2B]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp eq i8 [[LD1]], [[LD2B]]
-; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_END]], label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_HEADER]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, [[BLOCK_A]] ], [ 2, [[BLOCK_B]] ], [ 0, [[LOOP_LATCH]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
-;
-entry:
- %p1 = alloca [1024 x i8], align 1
- %p2 = alloca [1024 x i8], align 1
- call void @init_mem(ptr %p1, i64 1024)
- call void @init_mem(ptr %p2, i64 1024)
- br label %loop.header
-
-loop.header:
- %index = phi i64 [ %index.next, %loop.latch ], [ 0, %entry ]
- %gep1 = getelementptr inbounds i8, ptr %p1, i64 %index
- %ld1 = load i8, ptr %gep1, align 1
- %branch.cond = icmp slt i8 %ld1, 0
- br i1 %branch.cond, label %block.a, label %block.b
-
-block.a:
- %gep2a = getelementptr inbounds i8, ptr %p2, i64 %index
- %ld2a = load i8, ptr %gep2a, align 1
- %cmp.a = icmp eq i8 %ld1, %ld2a
- br i1 %cmp.a, label %loop.end, label %loop.latch
-
-block.b:
- %gep2b = getelementptr inbounds i8, ptr %p2, i64 %index
- %ld2b = load i8, ptr %gep2b, align 1
- %cmp.b = icmp eq i8 %ld1, %ld2b
- br i1 %cmp.b, label %loop.end, label %loop.latch
-
-loop.latch:
- %index.next = add i64 %index, 1
- %exitcond = icmp ne i64 %index.next, 64
- br i1 %exitcond, label %loop.header, label %loop.end
-
-loop.end:
- %retval = phi i64 [ 1, %block.a ], [ 2, %block.b ], [ 0, %loop.latch ]
- ret i64 %retval
-}
-
-
; Two early exits with udiv in a non-exiting middle block between them.
; The udiv is only executed if the first early exit is not taken, so it
; needs predication. This should not be vectorized.
@@ -723,3 +651,8 @@ declare i32 @foo(i32) readonly
declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
More information about the llvm-commits
mailing list