[llvm] [VPlan] Support arbitrary predicated early exits. (PR #182396)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 19 14:55:12 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

This removes the restriction requiring a single predicated early exit.
Using MaskedCond, we only combine early-exit conditions with block
masks from non-exiting control flow.

This means we have to check the early-exit conditions in program order, so
that, for the combined exit condition, we take the first exit (in program
order) whose condition is true at the first active lane.

To do so, sort the exits by their reverse post-order numbers.

Depends on https://github.com/llvm/llvm-project/pull/182395 (included in PR)

---

Patch is 67.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182396.diff


11 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (-29) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+5-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+3) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+5) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+41-7) 
- (modified) llvm/test/Transforms/LoopVectorize/early_exit_legality.ll (+6-39) 
- (modified) llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll (+273-150) 
- (modified) llvm/test/Transforms/LoopVectorize/predicated-single-exit.ll (+100-57) 
- (modified) llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll (+38-20) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e57e0cf636501..40bbf4b38b4f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1745,35 +1745,6 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     return false;
   }
 
-  // Sort exiting blocks by dominance order to establish a clear chain.
-  DT->updateDFSNumbers();
-  llvm::sort(UncountableExitingBlocks, [this](BasicBlock *A, BasicBlock *B) {
-    return DT->getNode(A)->getDFSNumIn() < DT->getNode(B)->getDFSNumIn();
-  });
-
-  // Verify that exits form a strict dominance chain: each block must
-  // dominate the next. This ensures each exit is only dominated by its
-  // predecessors in the chain.
-  for (unsigned I = 0; I + 1 < UncountableExitingBlocks.size(); ++I) {
-    if (!DT->properlyDominates(UncountableExitingBlocks[I],
-                               UncountableExitingBlocks[I + 1])) {
-      reportVectorizationFailure(
-          "Uncountable early exits do not form a dominance chain",
-          "Cannot vectorize early exit loop with non-dominating exits",
-          "NonDominatingEarlyExits", ORE, TheLoop);
-      return false;
-    }
-  }
-
-  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
-  if (LatchPredBB != UncountableExitingBlocks.back()) {
-    reportVectorizationFailure(
-        "Last early exiting block in the chain is not the latch predecessor",
-        "Cannot vectorize early exit loop", "EarlyExitNotLatchPredecessor", ORE,
-        TheLoop);
-    return false;
-  }
-
   // The latch block must have a countable exit.
   if (isa<SCEVCouldNotCompute>(
           PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..7863a5d955d3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1270,7 +1270,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     /// backedge value). Takes the wide induction recipe and the original
     /// backedge value as operands.
     ExitingIVValue,
-    OpsEnd = ExitingIVValue,
+    MaskedCond,
+    OpsEnd = MaskedCond,
   };
 
   /// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -1304,6 +1305,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
 
   /// Returns true if the VPInstruction does not need masking.
   bool alwaysUnmasked() const {
+    if (Opcode == VPInstruction::MaskedCond)
+      return false;
+
     // For now only VPInstructions with underlying values use masks.
     // TODO: provide masks to VPInstructions w/o underlying values.
     if (!getUnderlyingValue())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..998e48d411f50 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -129,6 +129,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
            inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
            "LogicalAnd/Or operands should be bool");
     return IntegerType::get(Ctx, 1);
+  case VPInstruction::MaskedCond:
+    assert(inferScalarType(R->getOperand(0))->isIntegerTy(1));
+    return IntegerType::get(Ctx, 1);
   case VPInstruction::BranchOnCond:
   case VPInstruction::BranchOnTwoConds:
   case VPInstruction::BranchOnCount:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..5fd24fbedbe57 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -457,6 +457,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
   case VPInstruction::ExtractLastLane:
   case VPInstruction::ExtractLastPart:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::MaskedCond:
   case VPInstruction::Not:
   case VPInstruction::ResumeForEpilogue:
   case VPInstruction::Reverse:
@@ -1345,6 +1346,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::LogicalOr:
+  case VPInstruction::MaskedCond:
   case VPInstruction::Not:
   case VPInstruction::PtrAdd:
   case VPInstruction::WideIVStep:
@@ -1491,6 +1493,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExitingIVValue:
     O << "exiting-iv-value";
     break;
+  case VPInstruction::MaskedCond:
+    O << "masked-cond";
+    break;
   case VPInstruction::ExtractLane:
     O << "extract-lane";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..c1cf6d2659a86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1438,6 +1438,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
+  if (match(Def, m_VPInstruction<VPInstruction::MaskedCond>())) {
+    if (Def->getNumOperands() == 2) {
+      VPValue *And = Builder.createNaryOp(
+          VPInstruction::LogicalAnd, {Def->getOperand(0), Def->getOperand(1)});
+      return Def->replaceAllUsesWith(And);
+    }
+    return Def->replaceAllUsesWith(Def->getOperand(0));
+  }
+
   // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
   // This is useful for fmax/fmin without fast-math flags, where we need to
   // check if any operand is NaN.
@@ -4080,10 +4089,17 @@ void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
           match(EarlyExitingVPBB->getTerminator(),
                 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
       assert(Matched && "Terminator must be BranchOnCond");
-      auto *CondToEarlyExit = TrueSucc == ExitBlock
-                                  ? CondOfEarlyExitingVPBB
-                                  : Builder.createNot(CondOfEarlyExitingVPBB);
+
+      // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
+      // the correct block mask.
+      VPBuilder EarlyExitBuilder(EarlyExitingVPBB->getTerminator());
+      auto *CondToEarlyExit = EarlyExitBuilder.createNaryOp(
+          VPInstruction::MaskedCond,
+          TrueSucc == ExitBlock
+              ? CondOfEarlyExitingVPBB
+              : EarlyExitBuilder.createNot(CondOfEarlyExitingVPBB));
       assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
+              !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
               VPDT.properlyDominates(
                   CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
                   LatchVPBB)) &&
@@ -4097,10 +4113,28 @@ void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
   }
 
   assert(!Exits.empty() && "must have at least one early exit");
-  // Sort exits by dominance to get the correct program order.
-  llvm::sort(Exits, [&VPDT](const EarlyExitInfo &A, const EarlyExitInfo &B) {
-    return VPDT.properlyDominates(A.EarlyExitingVPBB, B.EarlyExitingVPBB);
-  });
+  // Sort exits by RPO order to get correct program order. RPO gives a
+  // topological ordering of the CFG, ensuring upstream exits are checked
+  // before downstream exits in the dispatch chain.
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      HeaderVPBB);
+  DenseMap<VPBlockBase *, unsigned> RPONumber;
+  unsigned Num = 0;
+  for (VPBlockBase *VPB : RPOT)
+    RPONumber[VPB] = Num++;
+  llvm::sort(
+      Exits, [&RPONumber](const EarlyExitInfo &A, const EarlyExitInfo &B) {
+        return RPONumber[A.EarlyExitingVPBB] < RPONumber[B.EarlyExitingVPBB];
+      });
+#ifndef NDEBUG
+  // After RPO sorting, verify that for any pair where one exit dominates
+  // another, the dominating exit comes first. This is guaranteed by RPO
+  // (topological order) and is required for the dispatch chain correctness.
+  for (unsigned I = 0; I + 1 < Exits.size(); ++I)
+    assert(!VPDT.properlyDominates(Exits[I + 1].EarlyExitingVPBB,
+                                   Exits[I].EarlyExitingVPBB) &&
+           "RPO sort must place dominating exits before dominated ones");
+#endif
 
   // Build the AnyOf condition for the latch terminator using logical OR
   // to avoid poison propagation from later exit conditions when an earlier
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 2b68a4787b15a..5d9f01368934d 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -318,7 +318,7 @@ return:
 ; support this yet.
 define i64 @uncountable_exit_on_last_block() {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_on_last_block'
-; CHECK:       LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
+; CHECK:       LV: Not vectorizing: Cannot determine exact exit count for latch block.
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -492,42 +492,6 @@ exit:                                             ; preds = %for.body
   ret void
 }
 
-define i64 @uncountable_exit_in_conditional_block(ptr %mask) {
-; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block'
-; CHECK:       LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
-entry:
-  %p1 = alloca [1024 x i8]
-  %p2 = alloca [1024 x i8]
-  call void @init_mem(ptr %p1, i64 1024)
-  call void @init_mem(ptr %p2, i64 1024)
-  br label %loop
-
-loop:
-  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
-  %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index
-  %ld1 = load i8, ptr %arrayidx1, align 1
-  %cmp1 = icmp ne i8 %ld1, 0
-  br i1 %cmp1, label %loop.search, label %loop.inc
-
-loop.search:
-  %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index
-  %ld2 = load i8, ptr %arrayidx2, align 1
-  %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index
-  %ld3 = load i8, ptr %arrayidx3, align 1
-  %cmp2 = icmp eq i8 %ld2, %ld3
-  br i1 %cmp2, label %loop.inc, label %loop.end
-
-loop.inc:
-  %index.next = add i64 %index, 1
-  %exitcond = icmp ne i64 %index.next, 67
-  br i1 %exitcond, label %loop, label %loop.end
-
-loop.end:
-  %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ]
-  ret i64 %retval
-}
-
-
 define i64 @same_exit_block_pre_inc_use1_with_reduction() {
 ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction'
 ; CHECK:       LV: Not vectorizing: Found reductions or recurrences in early-exit loop.
@@ -594,9 +558,10 @@ loop.end:
 
 
 ; Two early exits on parallel branches (neither dominates the other).
+; This is now supported with predicated early exits.
 define i64 @uncountable_exits_on_parallel_branches() {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exits_on_parallel_branches'
-; CHECK:       LV: Not vectorizing: Uncountable early exits do not form a dominance chain.
+; CHECK:       LV: We can vectorize this loop!
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -633,9 +598,11 @@ loop.end:
 
 
 ; Parallel uncountable exits with loop-invariant conditions.
+; Note: This loop cannot be vectorized because the latch has no determinate
+; exit count (loop is infinite without early exits).
 define void @uncountable_exits_invariant_conditions(ptr %p, i1 %cond1, i1 %cond2, i1 %cond3) {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exits_invariant_conditions'
-; CHECK:       LV: Not vectorizing: Uncountable early exits do not form a dominance chain.
+; CHECK:       LV: Not vectorizing: Cannot determine exact exit count for latch block.
 entry:
   br label %loop.header
 
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
index 55b52299d4331..bbdbd646cc2b3 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
@@ -461,7 +461,7 @@ exit:
 
 define void @loop_contains_store_uncounted_exit_is_not_guaranteed_to_execute(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
 ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_uncounted_exit_is_not_guaranteed_to_execute'
-; CHECK:       LV: Not vectorizing: Last early exiting block in the chain is not the latch predecessor.
+; CHECK:       LV: Not vectorizing: Load for uncountable exit not guaranteed to execute.
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll b/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
index 3d13ececb7740..e15d4fde3d70e 100644
--- a/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicated-multiple-exits.ll
@@ -8,31 +8,47 @@
 
 define i64 @diamond_with_2_early_exits() {
 ; CHECK-LABEL: define i64 @diamond_with_2_early_exits() {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT:    [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT:    [[BRANCH_COND:%.*]] = icmp slt i8 [[L_A]], 0
-; CHECK-NEXT:    br i1 [[BRANCH_COND]], label %[[BLOCK_A:.*]], label %[[BLOCK_B:.*]]
+; CHECK-NEXT:    br label %[[BLOCK_A:.*]]
 ; CHECK:       [[BLOCK_A]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
-; CHECK-NEXT:    [[L_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[L_B]] to i64
-; CHECK-NEXT:    [[CMP_A:%.*]] = icmp eq i8 [[L_A]], [[L_B]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[GEP_B]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> splat (i1 true), <4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze <4 x i1> [[TMP10]]
+; CHECK-NEXT:    [[CMP_A:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[CMP_A]], label %[[LOOP_END:.*]], label %[[LOOP_LATCH]]
-; CHECK:       [[BLOCK_B]]:
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
-; CHECK-NEXT:    [[L_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i8 [[L_A]], [[L_C]]
-; CHECK-NEXT:    br i1 [[CMP_B]], label %[[LOOP_END]], label %[[LOOP_LATCH]]
 ; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 64
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[LOOP_END]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[BLOCK_A]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[LOOP_END1:.*]]
 ; CHECK:       [[LOOP_END]]:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[EXT]], %[[BLOCK_A]] ], [ [[IV]], %[[BLOCK_B]] ], [ 0, %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP10]], i1 false)
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[VECTOR_EARLY_EXIT_0:.*]], label %[[VECTOR_EARLY_EXIT_1:.*]]
+; CHECK:       [[VECTOR_EARLY_EXIT_1]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br label %[[LOOP_END1]]
+; CHECK:       [[VECTOR_EARLY_EXIT_0]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[IV]], [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br label %[[LOOP_END1]]
+; CHECK:       [[LOOP_END1]]:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[TMP16]], %[[VECTOR_EARLY_EXIT_0]] ], [ [[TMP15]], %[[VECTOR_EARLY_EXIT_1]] ], [ 0, %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL]]
 ;
 entry:
@@ -70,38 +86,58 @@ loop.end:
 
 define i64 @three_early_exits() {
 ; CHECK-LABEL: define i64 @three_early_exits() {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[INDEX_NEXT:%.*]], %[[CHECK_B:.*]] ]
 ; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr @A, i64 [[IV]]
-; CHECK-NEXT:    [[L_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT:    [[COND_A:%.*]] = icmp slt i8 [[L_A]], -42
-; CHECK-NEXT:    br i1 [[COND_A]], label %[[BLOCK_A:.*]], label %[[CHECK_B:.*]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[GEP_A]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], splat (i8 -42)
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], splat (i8 42)
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr @D, i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr @C, i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr @B, i64 [[IV]]
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP8]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> splat (i1 true), <4 x i1> [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP17]]
+; CHECK-NEXT:    [[COND_A:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT:    br i1...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/182396


More information about the llvm-commits mailing list