[llvm] [VPlan] Handle early exit loops with non-dereferenceable loads in latch (PR #172454)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 8 03:56:22 PST 2026


https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/172454

>From a98ba82faee120663d7d715ccb7f4ffad31ab24f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 18:26:09 +0800
Subject: [PATCH 1/3] Precommit tests

---
 .../AArch64/single-early-exit-successors.ll   | 59 +++++++++++++++++++
 .../LoopVectorize/RISCV/early-exit.ll         | 59 +++++++++++++++++++
 .../LoopVectorize/single_early_exit.ll        | 56 ++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
new file mode 100644
index 0000000000000..6ed3c6af55033
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -mtriple aarch64 -mattr=+sve -S %s | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK:       [[LOOP_INC]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK:       [[LOOP_END]]:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+  %ld3 = load i8, ptr %arrayidx2
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+  ret i8 %retval
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
new file mode 100644
index 0000000000000..91c31cf19cdbb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S < %s -p loop-vectorize -mtriple riscv64 -mattr=+v | FileCheck %s
+
+declare void @init_mem(ptr, i64)
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK:       [[LOOP_INC]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK:       [[LOOP_END]]:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+  %ld3 = load i8, ptr %arrayidx2
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+  ret i8 %retval
+}
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 71e2b82e2fdd1..8d0ba3ded4881 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -628,6 +628,62 @@ exit:
   %res = phi i64 [ -1, %entry ], [ -2, %then ], [ 0, %loop.latch ], [ %iv, %loop.header ]
   ret i64 %res
 }
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ [[LD3]], [[LOOP_INC]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+  %ld3 = load i8, ptr %arrayidx2
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+  ret i8 %retval
+}
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}

>From 3c380d34b7ee9df0d8be84b4efa9e51f3951e9cd Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 18:34:25 +0800
Subject: [PATCH 2/3] [VPlan] Handle early exit loops with predicated
 successors

---
 .../Vectorize/LoopVectorizationLegality.cpp   |  9 +++
 .../Vectorize/VPlanConstruction.cpp           | 34 +++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 17 +++-
 .../AArch64/single-early-exit-successors.ll   | 73 ++++++++++++++---
 .../LoopVectorize/RISCV/early-exit.ll         | 74 ++++++++++++++---
 .../LoopVectorize/single_early_exit.ll        | 79 ++++++++++++++++---
 6 files changed, 249 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 93229ea625a5d..7b56c35e817cc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
@@ -1840,6 +1841,9 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
 
   // Check non-dereferenceable loads if any.
   for (LoadInst *LI : NonDerefLoads) {
+    // Occurs after the early exit, so we can predicate it.
+    if (DT->properlyDominates(SingleUncountableExitingBlock, LI->getParent()))
+      continue;
     // Only support unit-stride access for now.
     int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
     if (Stride != 1) {
@@ -2038,6 +2042,11 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
         else
           return false;
       }
+      // isVectorizableEarlyExitLoop will have predicated some instructions when
+      // they previously weren't. Call canVectorizeWithIfConvert again to
+      // repopulate MaskedOp with any new instructions.
+      if (!canVectorizeWithIfConvert())
+        return false;
     }
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index a193d10703f03..07e518016c1cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -875,10 +875,31 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
 
   // Disconnect all early exits from the loop leaving it with a single exit from
   // the latch. Early exits that are countable are left for a scalar epilog. The
+  // latch is split to contain only the terminator, and the uncountable exiting
+  // blocks are connected to it, with a predicated edge to their successors. The
   // condition of uncountable early exits (currently at most one is supported)
   // is fused into the latch exit, and used to branch from middle block to the
   // early exit destination.
+  //
+  // BEFORE:                  | AFTER:
+  // 			      |
+  //    entry                 |      entry
+  //      |                   |        |
+  //      v                   |        v
+  // earlyexiting -----+      |   earlyexiting-+
+  //      |            |      |        |       |
+  //      v            v      |   [predicated] |
+  //    latch      earlyexit  |        |       |
+  //                          |        v       |
+  //                          |      latch     |
+  //                          |        |       |
+  //                          |        +-------+
+  //                          |        |
+  //                          |        v
+  //                          |    latch.split
+
   [[maybe_unused]] bool HandledUncountableEarlyExit = false;
+  VPBasicBlock *LatchSplitVPBB = nullptr;
   for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
     for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
       if (Pred == MiddleVPBB)
@@ -886,15 +907,22 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
       if (HasUncountableEarlyExit) {
         assert(!HandledUncountableEarlyExit &&
                "can handle exactly one uncountable early exit");
+        if (!LatchSplitVPBB)
+          LatchSplitVPBB =
+              LatchVPBB->splitAt(LatchVPBB->getTerminator()->getIterator());
         handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
-                                   cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
+                                   cast<VPBasicBlock>(HeaderVPB),
+                                   LatchSplitVPBB);
         HandledUncountableEarlyExit = true;
+        cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
+        VPBlockUtils::disconnectBlocks(Pred, EB);
+        VPBlockUtils::connectBlocks(Pred, LatchSplitVPBB);
       } else {
         for (VPRecipeBase &R : EB->phis())
           cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
+        cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
+        VPBlockUtils::disconnectBlocks(Pred, EB);
       }
-      cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
-      VPBlockUtils::disconnectBlocks(Pred, EB);
     }
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b1880517a4199..cbc8b5e4d16df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3926,7 +3926,7 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
       cast<VPIRPhi>(&R)->swapOperands();
   }
 
-  VPBuilder Builder(LatchVPBB->getTerminator());
+  VPBuilder Builder(EarlyExitingVPBB->getTerminator());
   VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
   assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
          "Terminator must be be BranchOnCond");
@@ -3936,6 +3936,18 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
                               ? CondOfEarlyExitingVPBB
                               : Builder.createNot(CondOfEarlyExitingVPBB);
 
+  // Create a mask and predicate any "exited" lanes in successor blocks.
+  VPValue *FirstActiveLane =
+      Builder.createNaryOp(VPInstruction::FirstActiveLane, {CondToEarlyExit},
+                           DebugLoc::getUnknown(), "first.active.lane");
+  VPValue *SuccMask = Builder.createICmp(
+      CmpInst::ICMP_ULT,
+      Builder.createNaryOp(VPInstruction::StepVector, {},
+                           Type::getInt64Ty(Plan.getContext())),
+      FirstActiveLane);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, SuccMask);
+  Builder.setInsertPoint(LatchVPBB->getTerminator());
+
   // Create a BranchOnTwoConds in the latch that branches to:
   // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
   VPValue *IsEarlyExitTaken =
@@ -3963,9 +3975,6 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
     VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
     if (!IncomingFromEarlyExit->isLiveIn()) {
       // Update the incoming value from the early exit.
-      VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
-          VPInstruction::FirstActiveLane, {CondToEarlyExit},
-          DebugLoc::getUnknown(), "first.active.lane");
       IncomingFromEarlyExit = EarlyExitB.createNaryOp(
           VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
           DebugLoc::getUnknown(), "early.exit.value");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
index 6ed3c6af55033..084fa080397c4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
@@ -11,23 +11,72 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
 ; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP8]], i1 false)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <vscale x 16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze <vscale x 16 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 16
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], i32 [[TMP17]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[LOOP_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br label %[[LOOP_END]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
 ; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END]]
 ; CHECK:       [[LOOP_INC]]:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX2]]
 ; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX2]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[LOOP_END]]:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP22]], %[[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i8 [[RETVAL]]
 ;
 entry:
@@ -57,3 +106,9 @@ loop.end:
   %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
   ret i8 %retval
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
index 91c31cf19cdbb..45b890ac23136 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
@@ -11,23 +11,73 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
 ; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 34)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[UMAX]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP8]], i1 false)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <vscale x 16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = freeze <vscale x 16 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 16
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], i32 [[TMP17]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[LOOP_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br label %[[LOOP_END]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
 ; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END]]
 ; CHECK:       [[LOOP_INC]]:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX2]]
 ; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX2]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[LOOP_END]]:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP22]], %[[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i8 [[RETVAL]]
 ;
 entry:
@@ -57,3 +107,9 @@ loop.end:
   %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
   ret i8 %retval
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 8d0ba3ded4881..90f1c45b29c84 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -638,22 +638,76 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[LOOP]] ], [ [[INDEX_NEXT9:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 3, [[INDEX1]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK:       loop.inc:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[CMP3:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC:%.*]], label [[LOOP_END:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i8> poison, i8 [[LD3]], i32 0
+; CHECK-NEXT:    br label [[LOOP_END]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[LOOP_INC]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP9]], i8 [[TMP13]], i32 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK:       pred.load.continue4:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i8> [ [[TMP9]], [[LOOP_END]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP19]], i32 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
+; CHECK:       pred.load.if7:
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP25]], i32 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; CHECK:       pred.load.continue8:
+; CHECK-NEXT:    [[TMP27:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ]
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP28]])
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 64
+; CHECK-NEXT:    [[TMP31:%.*]] = or i1 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP29]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP27]], i32 3
+; CHECK-NEXT:    br label [[LOOP_END1:%.*]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT:    br label [[LOOP_END1]]
 ; CHECK:       loop.end:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ [[LD3]], [[LOOP_INC]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[TMP33]], [[VECTOR_EARLY_EXIT]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i8 [[RETVAL]]
 ;
 entry:
@@ -698,4 +752,5 @@ loop.end:
 ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
 ;.

>From 5f77430e37cc378d65660353378f5a50ab385dea Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 19:53:38 +0800
Subject: [PATCH 3/3] Untabify diagram

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 07e518016c1cc..1cf77946a1337 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -882,7 +882,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
   // early exit destination.
   //
   // BEFORE:                  | AFTER:
-  // 			      |
+  //                          |
   //    entry                 |      entry
   //      |                   |        |
   //      v                   |        v



More information about the llvm-commits mailing list