[llvm] [VPlan] Handle early exit loops with non-dereferenceable loads in latch (PR #172454)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 8 03:56:22 PST 2026
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/172454
>From a98ba82faee120663d7d715ccb7f4ffad31ab24f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 18:26:09 +0800
Subject: [PATCH 1/3] Precommit tests
---
.../AArch64/single-early-exit-successors.ll | 59 +++++++++++++++++++
.../LoopVectorize/RISCV/early-exit.ll | 59 +++++++++++++++++++
.../LoopVectorize/single_early_exit.ll | 56 ++++++++++++++++++
3 files changed, 174 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
new file mode 100644
index 0000000000000..6ed3c6af55033
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -mtriple aarch64 -mattr=+sve -S %s | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK: [[LOOP_INC]]:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK: [[LOOP_END]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+ %ld3 = load i8, ptr %arrayidx2
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
new file mode 100644
index 0000000000000..91c31cf19cdbb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S < %s -p loop-vectorize -mtriple riscv64 -mattr=+v | FileCheck %s
+
+declare void @init_mem(ptr, i64)
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK: [[LOOP_INC]]:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK: [[LOOP_END]]:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+ %ld3 = load i8, ptr %arrayidx2
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 71e2b82e2fdd1..8d0ba3ded4881 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -628,6 +628,62 @@ exit:
%res = phi i64 [ -1, %entry ], [ -2, %then ], [ 0, %loop.latch ], [ %iv, %loop.header ]
ret i64 %res
}
+
+define i8 @predicate_exit_block_successors(ptr %p0) {
+; CHECK-LABEL: define i8 @predicate_exit_block_successors(
+; CHECK-SAME: ptr [[P0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ [[LD3]], [[LOOP_INC]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %arrayidx2 = getelementptr inbounds i8, ptr %p0, i64 %index
+ %ld3 = load i8, ptr %arrayidx2
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
+ ret i8 %retval
+}
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
>From 3c380d34b7ee9df0d8be84b4efa9e51f3951e9cd Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 18:34:25 +0800
Subject: [PATCH 2/3] [VPlan] Handle early exit loops with predicated
successors
---
.../Vectorize/LoopVectorizationLegality.cpp | 9 +++
.../Vectorize/VPlanConstruction.cpp | 34 +++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +++-
.../AArch64/single-early-exit-successors.ll | 73 ++++++++++++++---
.../LoopVectorize/RISCV/early-exit.ll | 74 ++++++++++++++---
.../LoopVectorize/single_early_exit.ll | 79 ++++++++++++++++---
6 files changed, 249 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 93229ea625a5d..7b56c35e817cc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
@@ -1840,6 +1841,9 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// Check non-dereferenceable loads if any.
for (LoadInst *LI : NonDerefLoads) {
+ // The load occurs after the early exit, so we can predicate it.
+ if (DT->properlyDominates(SingleUncountableExitingBlock, LI->getParent()))
+ continue;
// Only support unit-stride access for now.
int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
if (Stride != 1) {
@@ -2038,6 +2042,11 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
else
return false;
}
+ // isVectorizableEarlyExitLoop may now rely on predicating instructions that
+ // previously were not predicated. Call canVectorizeWithIfConvert again to
+ // repopulate MaskedOp with any new instructions.
+ if (!canVectorizeWithIfConvert())
+ return false;
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index a193d10703f03..07e518016c1cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -875,10 +875,31 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
// Disconnect all early exits from the loop leaving it with a single exit from
// the latch. Early exits that are countable are left for a scalar epilog. The
+ // latch terminator is split into a new block, and the uncountable exiting
+ // blocks are connected to it, with a predicated edge to their successors. The
// condition of uncountable early exits (currently at most one is supported)
// is fused into the latch exit, and used to branch from middle block to the
// early exit destination.
+ //
+ // BEFORE:                 | AFTER:
+ //                         |
+ //    entry                |    entry
+ //      |                  |      |
+ //      v                  |      v
+ // earlyexiting -----+     | earlyexiting-+
+ //      |            |     |      |       |
+ //      v            v     | [predicated] |
+ //    latch      earlyexit |      |       |
+ //                         |      v       |
+ //                         |    latch     |
+ //                         |      |       |
+ //                         |      +-------+
+ //                         |          |
+ //                         |          v
+ //                         |     latch.split
+
[[maybe_unused]] bool HandledUncountableEarlyExit = false;
+ VPBasicBlock *LatchSplitVPBB = nullptr;
for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
if (Pred == MiddleVPBB)
@@ -886,15 +907,22 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
if (HasUncountableEarlyExit) {
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
+ if (!LatchSplitVPBB)
+ LatchSplitVPBB =
+ LatchVPBB->splitAt(LatchVPBB->getTerminator()->getIterator());
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
+ cast<VPBasicBlock>(HeaderVPB),
+ LatchSplitVPBB);
HandledUncountableEarlyExit = true;
+ cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
+ VPBlockUtils::disconnectBlocks(Pred, EB);
+ VPBlockUtils::connectBlocks(Pred, LatchSplitVPBB);
} else {
for (VPRecipeBase &R : EB->phis())
cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
+ cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
+ VPBlockUtils::disconnectBlocks(Pred, EB);
}
- cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
- VPBlockUtils::disconnectBlocks(Pred, EB);
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b1880517a4199..cbc8b5e4d16df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3926,7 +3926,7 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
cast<VPIRPhi>(&R)->swapOperands();
}
- VPBuilder Builder(LatchVPBB->getTerminator());
+ VPBuilder Builder(EarlyExitingVPBB->getTerminator());
VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
"Terminator must be be BranchOnCond");
@@ -3936,6 +3936,18 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
? CondOfEarlyExitingVPBB
: Builder.createNot(CondOfEarlyExitingVPBB);
+ // Create a mask and predicate any "exited" lanes in successor blocks.
+ VPValue *FirstActiveLane =
+ Builder.createNaryOp(VPInstruction::FirstActiveLane, {CondToEarlyExit},
+ DebugLoc::getUnknown(), "first.active.lane");
+ VPValue *SuccMask = Builder.createICmp(
+ CmpInst::ICMP_ULT,
+ Builder.createNaryOp(VPInstruction::StepVector, {},
+ Type::getInt64Ty(Plan.getContext())),
+ FirstActiveLane);
+ Builder.createNaryOp(VPInstruction::BranchOnCond, SuccMask);
+ Builder.setInsertPoint(LatchVPBB->getTerminator());
+
// Create a BranchOnTwoConds in the latch that branches to:
// [0] vector.early.exit, [1] middle block, [2] header (continue looping).
VPValue *IsEarlyExitTaken =
@@ -3963,9 +3975,6 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
if (!IncomingFromEarlyExit->isLiveIn()) {
// Update the incoming value from the early exit.
- VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
- VPInstruction::FirstActiveLane, {CondToEarlyExit},
- DebugLoc::getUnknown(), "first.active.lane");
IncomingFromEarlyExit = EarlyExitB.createNaryOp(
VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
DebugLoc::getUnknown(), "early.exit.value");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
index 6ed3c6af55033..084fa080397c4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-successors.ll
@@ -11,23 +11,72 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP8]], i1 false)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = freeze <vscale x 16 x i1> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], i32 [[TMP17]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[LOOP_END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END]]
; CHECK: [[LOOP_INC]]:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX2]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP22]], %[[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i8 [[RETVAL]]
;
entry:
@@ -57,3 +106,9 @@ loop.end:
%retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
ret i8 %retval
}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
index 91c31cf19cdbb..45b890ac23136 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/early-exit.ll
@@ -11,23 +11,73 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 34)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[UMAX]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP8]], i1 false)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 16 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = freeze <vscale x 16 x i1> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], i32 [[TMP17]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[LOOP_END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 16
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label %[[LOOP_END]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 3, %[[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END:.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END]]
; CHECK: [[LOOP_INC]]:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX2]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[LOOP_END]]:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], %[[LOOP]] ], [ [[LD3]], %[[LOOP_INC]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP22]], %[[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i8 [[RETVAL]]
;
entry:
@@ -57,3 +107,9 @@ loop.end:
%retval = phi i8 [ %ld1, %loop ], [ %ld3, %loop.inc ]
ret i8 %retval
}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 8d0ba3ded4881..90f1c45b29c84 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -638,22 +638,76 @@ define i8 @predicate_exit_block_successors(ptr %p0) {
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[LOOP]] ], [ [[INDEX_NEXT9:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[FIRST_ACTIVE_LANE]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[CMP3:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC:%.*]], label [[LOOP_END:%.*]]
+; CHECK: pred.load.if:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP5]]
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> poison, i8 [[LD3]], i32 0
+; CHECK-NEXT: br label [[LOOP_END]]
+; CHECK: pred.load.continue:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK: pred.load.if3:
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP9]], i8 [[TMP13]], i32 1
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; CHECK: pred.load.continue4:
+; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i8> [ [[TMP9]], [[LOOP_END]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK: pred.load.if5:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP19]], i32 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
+; CHECK: pred.load.continue6:
+; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
+; CHECK: pred.load.if7:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP25]], i32 3
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]]
+; CHECK: pred.load.continue8:
+; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ]
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP28:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP28]])
+; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 64
+; CHECK-NEXT: [[TMP31:%.*]] = or i1 [[TMP29]], [[TMP30]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br i1 [[TMP29]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i8> [[TMP27]], i32 3
+; CHECK-NEXT: br label [[LOOP_END1:%.*]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; CHECK-NEXT: br label [[LOOP_END1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ [[LD3]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[TMP33]], [[VECTOR_EARLY_EXIT]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i8 [[RETVAL]]
;
entry:
@@ -698,4 +752,5 @@ loop.end:
; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
;.
>From 5f77430e37cc378d65660353378f5a50ab385dea Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 8 Jan 2026 19:53:38 +0800
Subject: [PATCH 3/3] Untabify diagram
---
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 07e518016c1cc..1cf77946a1337 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -882,7 +882,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
// early exit destination.
//
// BEFORE: | AFTER:
- // |
+ // |
// entry | entry
// | | |
// v | v
More information about the llvm-commits
mailing list