[llvm] [LV] Vectorize conditional scalar assignments (PR #158088)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 26 09:13:01 PST 2025
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/158088
>From 34f27987a8eb1aa34f09f5763f1f1e156329a1e4 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 26 Mar 2025 11:35:01 +0000
Subject: [PATCH 01/21] [LV] Vectorize conditional scalar assignments
Based on Michael Maitland's previous work:
https://github.com/llvm/llvm-project/pull/121222
This PR uses the existing recurrences code instead of introducing a
new pass just for CSA autovec. I've also made recipes that are more
generic.
I've enabled it by default to see the impact on tests; if there are
regressions we can put it behind a cli option.
---
llvm/include/llvm/Analysis/IVDescriptors.h | 23 +-
llvm/lib/Analysis/IVDescriptors.cpp | 45 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 1 +
.../Transforms/Vectorize/LoopVectorize.cpp | 53 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 3 +
llvm/lib/Transforms/Vectorize/VPlan.h | 37 ++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 19 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 ++
.../Transforms/Vectorize/VPlanTransforms.cpp | 78 +++
.../Transforms/Vectorize/VPlanTransforms.h | 8 +
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../AArch64/conditional-scalar-assignment.ll | 397 ++++++++++++
.../conditional-scalar-assignment-vplan.ll | 123 ++++
.../LoopVectorize/iv-select-cmp-decreasing.ll | 339 +++++++++--
.../LoopVectorize/iv-select-cmp-no-wrap.ll | 88 ++-
.../iv-select-cmp-non-const-iv-start.ll | 373 ++++++++++--
.../LoopVectorize/iv-select-cmp-trunc.ll | 570 ++++++++++++++----
.../Transforms/LoopVectorize/iv-select-cmp.ll | 190 +++++-
.../Transforms/LoopVectorize/select-cmp.ll | 141 ++++-
20 files changed, 2238 insertions(+), 300 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 2c8484fde5b16..e2d99ef629af2 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< are of integer type, one is the current recurrence value,
+ ///< and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -175,13 +178,12 @@ class RecurrenceDescriptor {
/// Returns a struct describing whether the instruction is either a
/// Select(ICmp(A, B), X, Y), or
/// Select(FCmp(A, B), X, Y)
- /// where one of (X, Y) is an increasing (FindLast) or decreasing (FindFirst)
- /// loop induction variable, and the other is a PHI value.
- // TODO: Support non-monotonic variable. FindLast does not need be restricted
- // to increasing loop induction variables.
- LLVM_ABI static InstDesc isFindIVPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE);
+ /// where one of (X, Y) is an increasing (FindLastIV) or decreasing
+ /// (FindFirstIV) loop induction variable, or an arbitrary integer value
+ /// (FindLast), and the other is a PHI value.
+ LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop,
+ PHINode *OrigPhi, Instruction *I,
+ ScalarEvolution &SE);
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
@@ -305,6 +307,13 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value and the
+ /// other is a recurrence.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 4d21f1c7e2de2..b34e204dddd0f 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -58,6 +58,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -695,9 +697,9 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi,
// value of the data type or a non-constant value by using mask and multiple
// reduction operations.
RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE) {
+RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
+ PHINode *OrigPhi, Instruction *I,
+ ScalarEvolution &SE) {
// TODO: Support the vectorization of FindLastIV when the reduction phi is
// used by more than one select instruction. This vectorization is only
// performed when the SCEV of each increasing induction variable used by the
@@ -706,8 +708,10 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
// We are looking for selects of the form:
- // select(cmp(), phi, loop_induction) or
- // select(cmp(), loop_induction, phi)
+ // select(cmp(), phi, value) or
+ // select(cmp(), value, phi)
+ // where 'value' is a loop induction variable
+ // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast).
// TODO: Match selects with multi-use cmp conditions.
Value *NonRdxPhi = nullptr;
if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi),
@@ -716,6 +720,25 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
m_Value(NonRdxPhi)))))
return InstDesc(false, I);
+ if (isFindLastRecurrenceKind(Kind)) {
+ // Must be an integer scalar.
+ Type *Type = OrigPhi->getType();
+ if (!Type->isIntegerTy())
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // The Select must be used only outside the loop and by the PHI.
+ for (User *U : I->users()) {
+ if (U == OrigPhi)
+ continue;
+ if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI))
+ continue;
+ return InstDesc(false, I);
+ }
+
+ return InstDesc(I, RecurKind::FindLast);
+ }
+
// Returns either FindFirstIV/FindLastIV, if such a pattern is found, or
// std::nullopt.
auto GetRecurKind = [&](Value *V) -> std::optional<RecurKind> {
@@ -925,8 +948,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
Kind == RecurKind::Add || Kind == RecurKind::Mul ||
Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs)
return isConditionalRdxPattern(I);
- if (isFindIVRecurrenceKind(Kind) && SE)
- return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE)
+ return isFindPattern(Kind, L, OrigPhi, I, *SE);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1146,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC,
+ DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1248,6 +1275,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
return Instruction::FCmp;
+ case RecurKind::FindLast:
+ return Instruction::Select;
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0bae00bafee3c..a587a17c08f2d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5471,6 +5471,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 393586e504c17..633d200e126c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1293,6 +1293,7 @@ class LoopVectorizationCostModel {
"from latch block\n");
return true;
}
+
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
"interleaved group requires scalar epilogue\n");
@@ -4084,6 +4085,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
continue;
case VPDef::VPReductionSC:
case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPLastActiveMaskPHISC:
case VPDef::VPWidenCallSC:
case VPDef::VPWidenCanonicalIVSC:
case VPDef::VPWidenCastSC:
@@ -4302,11 +4304,15 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const {
// Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
// reductions need special handling and are currently unsupported.
+ // FindLast reductions also require special handling for the synthesized
+ // mask PHI.
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
if (!Legal->isReductionVariable(&Phi))
return Legal->isFixedOrderRecurrence(&Phi);
- return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
- Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind());
+ RecurKind Kind =
+ Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) ||
+ RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
}))
return false;
@@ -4612,6 +4618,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8586,6 +8598,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan,
+ RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8669,10 +8685,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
RecurKind Kind = PhiR->getRecurrenceKind();
- assert(
- !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
- !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
- "AnyOf and FindIV reductions are not allowed for in-loop reductions");
+ assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
+ "AnyOf, FindIV, and FindLast reductions are not allowed for in-loop "
+ "reductions");
bool IsFPRecurrence =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind);
@@ -8976,7 +8993,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -9389,7 +9407,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
for (VPRecipeBase &R :
EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
+ if (isa<VPCanonicalIVPHIRecipe, VPLastActiveMaskPHIRecipe>(&R))
continue;
EpiWidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
@@ -9586,6 +9604,10 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
continue;
}
}
+ } else if (isa<VPLastActiveMaskPHIRecipe>(R)) {
+ // LastActiveMasks are only used as part of FindLast reductions,
+ // and aren't passed to the scalar loop.
+ continue;
} else {
// Retrieve the induction resume values for wide inductions from
// their original phi nodes in the scalar loop.
@@ -10107,6 +10129,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for FindLast reductions.
+ if (any_of(make_second_range(LVL.getReductionVars()), [&](auto &RdxDesc) {
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3b36ccbd677dc..103030d0028cb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25381,6 +25381,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25522,6 +25523,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25628,6 +25630,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0c7d9c0193a03..1081efa21dbb3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -561,6 +561,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
case VPRecipeBase::VPActiveLaneMaskPHISC:
+ case VPRecipeBase::VPLastActiveMaskPHISC:
case VPRecipeBase::VPFirstOrderRecurrencePHISC:
case VPRecipeBase::VPWidenPHISC:
case VPRecipeBase::VPWidenIntOrFpInductionSC:
@@ -1121,6 +1122,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// Returns the value for vscale.
VScale,
OpsEnd = VScale,
+ /// Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -3639,6 +3642,40 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
#endif
};
+// TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close
+// to this (just a PHI of a predicate), but isn't a header phi so can't
+// be used for the mask of FindLast reductions.
+//
+// This is basically a clone of VPActiveLaneMaskPHIRecipe, but won't run into
+// problems with transforms that expect there to only be a single ALM PHI, and
+// can be ignored by other code looking for a (non-existent) underlying value.
+class VPLastActiveMaskPHIRecipe : public VPHeaderPHIRecipe {
+public:
+ VPLastActiveMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPDef::VPLastActiveMaskPHISC, nullptr, StartMask,
+ DL) {}
+
+ ~VPLastActiveMaskPHIRecipe() override = default;
+
+ VPLastActiveMaskPHIRecipe *clone() override {
+ auto *R = new VPLastActiveMaskPHIRecipe(getOperand(0), getDebugLoc());
+ if (getNumOperands() == 2)
+ R->addOperand(getOperand(1));
+ return R;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPLastActiveMaskPHISC);
+
+ /// Generate the mask phi
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for generating the phi node for the current index of elements,
/// adjusted in accordance with EVL value. It starts at the start value of the
/// canonical induction and gets incremented by EVL in each iteration of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 80a2e4bc3f754..3168d6652f03b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -118,7 +118,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -276,14 +277,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
- VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe,
+ VPLastActiveMaskPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due to it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1c88b56ca89dc..13a802c5287d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -591,6 +591,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -1048,6 +1049,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1184,6 +1196,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(
+ Intrinsic::experimental_vector_extract_last_active, ScalarTy,
+ {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1240,6 +1261,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1305,6 +1327,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1490,6 +1513,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::Unpack:
O << "unpack";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -4501,6 +4527,27 @@ void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPLastActiveMaskPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH =
+ State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
+ Value *StartMask = State.get(getOperand(0));
+ PHINode *Phi =
+ State.Builder.CreatePHI(StartMask->getType(), 2, "last.active.mask");
+ Phi->addIncoming(StartMask, VectorPH);
+ State.set(this, Phi);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPLastActiveMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "LAST-ACTIVE-MASK-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8bf1003e923b1..d72b6360ee541 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;
using namespace VPlanPatternMatch;
@@ -4902,3 +4903,80 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
+
+void VPlanTransforms::convertFindLastRecurrences(
+ VPlan &Plan, VPRecipeBuilder &RecipeBuilder,
+ LoopVectorizationLegality *Legal) {
+ assert(Legal && "Need valid LoopVecLegality");
+
+ // Scalar-only plans are left untouched. TODO: Is a better approach needed?
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ // We want to create the following nodes:
+ // vec.body:
+ // mask.phi = phi <VF x i1> [ all.false, vec.ph ], [ new.mask, vec.body ]
+ // ...data.phi already exists, but needs updating...
+ // data.phi = phi <VF x Ty> [ default.val, vec.ph ], [ new.data, vec.body ]
+ //
+ // ...'data' and 'compare' created by existing nodes...
+ //
+ // any_active = i1 any_of_reduction(compare)
+ // new.mask = select any_active, compare, mask.phi
+ // new.data = select any_active, data, data.phi
+ //
+ // middle.block:
+ // result = extract-last-active new.data, new.mask, default.val
+
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi);
+ VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
+
+ // Add mask phi
+ VPValue *False =
+ Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext()));
+ auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc());
+ Builder.insert(MaskPHI);
+
+ // Find the condition for the select
+ SelectInst *Select = cast<SelectInst>(RdxDesc.getLoopExitInstr());
+ auto *SR = cast<VPWidenSelectRecipe>(RecipeBuilder.getRecipe(Select));
+ VPValue *Cond = SR->getCond();
+
+ // Add select for mask
+ Builder.setInsertPoint(SR);
+ VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
+ VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
+ MaskPHI->addOperand(MaskSelect);
+
+ // Replace select for data
+ VPValue *DataSelect = Builder.createSelect(
+ AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
+ SR->replaceAllUsesWith(DataSelect);
+ SR->eraseFromParent();
+
+ // Find final reduction and replace it with an
+ // extract.last.active intrinsic.
+ VPInstruction *RdxResult = nullptr;
+ for (VPUser *U : DataSelect->users()) {
+ VPInstruction *I = dyn_cast<VPInstruction>(U);
+ if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) {
+ RdxResult = I;
+ break;
+ }
+ }
+
+ assert(RdxResult);
+ Builder.setInsertPoint(RdxResult);
+ VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn(
+ RdxDesc.getRecurrenceStartValue());
+ auto *ExtractLastActive = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default},
+ RdxResult->getDebugLoc());
+ RdxResult->replaceAllUsesWith(ExtractLastActive);
+ RdxResult->eraseFromParent();
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 6245a5107a5d0..fd64346df158a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -24,6 +24,7 @@ namespace llvm {
class InductionDescriptor;
class Instruction;
class LoopVersioning;
+class LoopVectorizationLegality;
class PHINode;
class ScalarEvolution;
class PredicatedScalarEvolution;
@@ -390,6 +391,13 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+ /// Change FindLast reductions to save the appropriate state using selects
+ /// for entire vectors for both the latest mask containing at least one active
+ /// element and the corresponding data vector.
+ static void convertFindLastRecurrences(VPlan &Plan,
+ VPRecipeBuilder &RecipeBuilder,
+ LoopVectorizationLegality *Legal);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 839a304904e8b..4dbcabd2db5c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -48,7 +48,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
}
bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
- if (isa<VPActiveLaneMaskPHIRecipe>(V))
+ if (isa<VPActiveLaneMaskPHIRecipe, VPLastActiveMaskPHIRecipe>(V))
return true;
auto IsWideCanonicalIV = [](VPValue *A) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 63eacd3d75721..55e7af00f8fc4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -374,6 +374,7 @@ class VPDef {
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
+ VPLastActiveMaskPHISC,
VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
new file mode 100644
index 0000000000000..25c698f3df245
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -0,0 +1,397 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON
+; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE
+
+;; The following run line caused an ICE before using a dedicated FindLast PHI recipe.
+;; We're not looking at the resulting IR, just confirming it doesn't crash.
+; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null
+
+target triple = "aarch64-linux-gnu"
+
+define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
+; NEON-LABEL: define i32 @simple_csa_int_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret i32 [[SELECT_DATA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]]
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A_FR]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <vscale x 4 x i32> [[WIDE_LOAD]]
+; SVE-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]]
+; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP7]])
+; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[LAST_ACTIVE_MASK]]
+; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD_FR]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> [[TMP9]], i32 -1)
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A_FR]], [[LD]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
+; NEON-LABEL: define ptr @simple_csa_ptr_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret ptr [[SELECT_DATA]]
+;
+; SVE-LABEL: define ptr @simple_csa_ptr_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: ret ptr [[SELECT_DATA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv
+ %ld = load ptr, ptr %ld.addr, align 4
+ %ld.i64 = ptrtoint ptr %ld to i64
+ %select.cmp = icmp slt i64 %a, %ld.i64
+ %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret ptr %select.data
+}
+
+define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
+; NEON-LABEL: define float @simple_csa_float_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret float [[SELECT_DATA]]
+;
+; SVE-LABEL: define float @simple_csa_float_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: ret float [[SELECT_DATA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv
+ %ld = load float, ptr %ld.addr, align 4
+ %select.cmp = fcmp olt float %a, %ld
+ %select.data = select i1 %select.cmp, float %ld, float %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret float %select.data
+}
+
+define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) {
+; NEON-LABEL: define i32 @multi_user_csa_int_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]]
+; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret i32 [[SELECT_DATA]]
+;
+; SVE-LABEL: define i32 @multi_user_csa_int_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]]
+; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: ret i32 [[SELECT_DATA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv
+ store i32 %select.data, ptr %res.addr, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+
+define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
+; NEON-LABEL: define i32 @multi_use_cmp_for_csa_int_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; NEON-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32
+; NEON-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]]
+; NEON-NEXT: ret i32 [[RES]]
+;
+; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; SVE-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32
+; SVE-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]]
+; SVE-NEXT: ret i32 [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ %idx = trunc i64 %select.idx to i32
+ %res = add i32 %idx, %select.data
+ ret i32 %res
+}
+
+
+define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) {
+; NEON-LABEL: define i32 @chained_select_for_csa_int_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]]
+; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
+; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
+; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]]
+; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret i32 [[SELECT_DATA]]
+;
+; SVE-LABEL: define i32 @chained_select_for_csa_int_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]]
+; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
+; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
+; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]]
+; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: ret i32 [[SELECT_DATA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv
+ %ld1 = load i32, ptr %ld1.addr, align 4
+ %select.cmp1 = icmp slt i32 %a, %ld1
+ %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi
+ %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv
+ %ld2 = load i32, ptr %ld2.addr, align 4
+ %select.cmp2 = icmp sgt i32 %b, %ld2
+ %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
new file mode 100644
index 0000000000000..e802093fc7886
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -0,0 +1,123 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \
+; RUN: -disable-output 2>&1 < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This function is derived from the following C program:
+; int simple_csa_int_select(int N, int *data, int a) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data[i])
+; t = data[i];
+; }
+; return t;
+; }
+define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
+; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8>
+; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
+; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr>
+; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6>
+; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
+; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp>
+; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4>
+; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4
+; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld
+; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block)
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 2200a7d0431d2..503837894a7b4 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC1VF4 %s
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF4 %s
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF1 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC1VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC4VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefix=IC4VF1 %s
define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start(
@@ -961,29 +961,142 @@ exit: ; preds = %loop
}
; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX,
-; and since the IV hits this value, it is impossible to vectorize this case.
+; and since the IV hits this value, it cannot be vectorized as a FindLastIV
+; reduction. Instead, it is recognized and vectorized as a generic FindLast.
; In this test, %iv's range will include both signed and unsigned
; maximum (sentinel) values.
-define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*:]]
+; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
+; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]]
+; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; IC1VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; IC1VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]])
+; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*:]]
+; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
+; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]]
+; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; IC4VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; IC4VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]])
+; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
@@ -1005,26 +1118,164 @@ exit:
ret i64 %cond
}
-define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*]]:
+; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
+; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*]]:
+; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
+; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index 21ef1885b75b9..18f1470aba3a5 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -145,10 +145,44 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-LABEL: define i64 @select_icmp_nuw(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -157,9 +191,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -186,10 +220,44 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-LABEL: define i64 @select_icmp_noflag(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -198,9 +266,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -229,4 +297,8 @@ exit: ; preds = %for.body
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 72ed6537ef640..7a89c32b197d3 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -1,32 +1,156 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4
define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %iv_start ,i64 %n) {
-; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
-; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
-; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
-; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1: [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3
+; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4: [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3
+; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
+; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
;
entry:
%guard = icmp slt i64 %iv_start, %n
@@ -49,32 +173,162 @@ exit:
}
define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) {
-; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
-; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
-; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
-; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1: [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4: [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]]
;
entry:
%guard = icmp slt i32 %iv_start, %n
@@ -101,3 +355,18 @@ exit:
%idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ]
ret i32 %idx.0.lcssa
}
+;.
+; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index 45c2abd43c36a..839ea7ce7e7a4 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -674,65 +674,125 @@ exit: ; preds = %for.body
; Negative tests
-; This test can theoretically be vectorized, but only with a runtime-check.
-; The construct that are introduced by IndVarSimplify is:
+; This test can theoretically be vectorized as a FindLastIV reduction, but only
+; with a runtime-check. It will vectorize as a generic FindLast reduction.
+;
+; For FindLastIV, the construct that is introduced by IndVarSimplify is:
; %1 = trunc i64 %iv to i32
; However, the loop guard is an i64:
; %cmp.sgt = icmp sgt i64 %n, 0
; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the
; sentinel value), and need a runtime-check to vectorize this case.
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
-; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(
+define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(
+; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 3
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP9]], i32 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(
+; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0
@@ -778,67 +838,127 @@ exit: ; preds = %for.body, %entry
ret i32 %rdx.lcssa
}
-; This test can theoretically be vectorized, but only with a runtime-check.
-; The construct that are introduced by IndVarSimplify is:
+; This test can theoretically be vectorized as a FindLastIV reduction, but only
+; with a runtime-check. It will vectorize as a generic FindLast reduction.
+;
+; For FindLastIV, the construct that is introduced by IndVarSimplify is:
; %1 = trunc i64 %iv to i32
; However, the loop guard is unsigned:
; %cmp.not = icmp eq i32 %n, 0
; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the
; sentinel value), and need a runtime-check to vectorize this case.
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
-; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP8]], 3
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
@@ -899,41 +1019,61 @@ exit: ; preds = %for.body, %entry
define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
+; CHECK-VF4IC1-NEXT: ret i32 [[TMP7]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
+; CHECK-VF4IC4-NEXT: ret i32 [[TMP7]]
;
; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) {
@@ -980,44 +1120,112 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard(
@@ -1071,38 +1279,84 @@ exit: ; preds = %for.body
define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1)
+; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1)
+; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
@@ -1156,22 +1410,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -1184,22 +1472,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index a071949f82062..6001ee32ca62a 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1948,16 +1948,52 @@ exit: ; preds = %for.body
}
; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since
-; the IV hits this value, it is impossible to vectorize this case.
+; the IV hits this value, the sentinel-based scheme cannot be used; the loop is instead vectorized as a generic last-active reduction.
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1967,19 +2003,55 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1989,9 +2061,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
@@ -2042,10 +2114,50 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2054,18 +2166,58 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2074,9 +2226,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 2b352abe9f7a1..e19ebb9a3251c 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1128,27 +1128,124 @@ exit: ; preds = %loop
ret float %sel
}
-; We don't support selecting loop-variant values.
define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
-; CHECK-LABEL: define i32 @select_variant_i32_from_icmp(
-; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
-; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
-; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
-; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[SEL_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; CHECK-VF4IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
;
entry:
br label %loop
@@ -1220,6 +1317,8 @@ exit: ; preds = %loop
; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
;.
; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1235,6 +1334,8 @@ exit: ; preds = %loop
; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
;.
; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
>From 94fbb30bdeca3387b938aa01276f4af53d109259 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 11 Nov 2025 10:26:04 +0000
Subject: [PATCH 02/21] Apply suggestion from @MacDue
Co-authored-by: Benjamin Maxwell <macdue at dueutil.tech>
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 633d200e126c2..79781720affd5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1293,7 +1293,6 @@ class LoopVectorizationCostModel {
"from latch block\n");
return true;
}
-
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
"interleaved group requires scalar epilogue\n");
>From d4ad7bee52fabde80e6530dedd22320ba48fa71f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 11 Nov 2025 16:02:27 +0000
Subject: [PATCH 03/21] Remove LVL param
---
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 101 +++++++++---------
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
3 files changed, 51 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79781720affd5..dd043f0950fbe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8599,7 +8599,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Create whole-vector selects for find-last recurrences.
VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan,
- RecipeBuilder, Legal);
+ RecipeBuilder);
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
@@ -9603,10 +9603,6 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
continue;
}
}
- } else if (isa<VPLastActiveMaskPHIRecipe>(R)) {
- // LastActiveMasks are only used as part of FindLast reductions,
- // and aren't passed to the scalar loop.
- continue;
} else {
// Retrieve the induction resume values for wide inductions from
// their original phi nodes in the scalar loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d72b6360ee541..1ec2ede0676fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -41,7 +41,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;
using namespace VPlanPatternMatch;
@@ -4905,9 +4904,7 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
void VPlanTransforms::convertFindLastRecurrences(
- VPlan &Plan, VPRecipeBuilder &RecipeBuilder,
- LoopVectorizationLegality *Legal) {
- assert(Legal && "Need valid LoopVecLegality");
+ VPlan &Plan, VPRecipeBuilder &RecipeBuilder) {
// May need to do something better than this?
if (Plan.hasScalarVFOnly())
@@ -4928,55 +4925,55 @@ void VPlanTransforms::convertFindLastRecurrences(
// middle.block:
// result = extract-last-active new.data, new.mask, default.val
- for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
- if (RecurrenceDescriptor::isFindLastRecurrenceKind(
- RdxDesc.getRecurrenceKind())) {
- VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi);
- VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
-
- // Add mask phi
- VPValue *False =
- Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext()));
- auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc());
- Builder.insert(MaskPHI);
-
- // Find the condition for the select
- SelectInst *Select = cast<SelectInst>(RdxDesc.getLoopExitInstr());
- auto *SR = cast<VPWidenSelectRecipe>(RecipeBuilder.getRecipe(Select));
- VPValue *Cond = SR->getCond();
-
- // Add select for mask
- Builder.setInsertPoint(SR);
- VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
- VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
- MaskPHI->addOperand(MaskSelect);
-
- // Replace select for data
- VPValue *DataSelect = Builder.createSelect(
- AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
- SR->replaceAllUsesWith(DataSelect);
- SR->eraseFromParent();
-
- // Find final reduction and replace it with an
- // extract.last.active intrinsic.
- VPInstruction *RdxResult = nullptr;
- for (VPUser *U : DataSelect->users()) {
- VPInstruction *I = dyn_cast<VPInstruction>(U);
- if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) {
- RdxResult = I;
- break;
- }
- }
+ for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
+ if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
+ PhiR->getRecurrenceKind()))
+ continue;
- assert(RdxResult);
- Builder.setInsertPoint(RdxResult);
- VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn(
- RdxDesc.getRecurrenceStartValue());
- auto *ExtractLastActive = Builder.createNaryOp(
- VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default},
- RdxResult->getDebugLoc());
- RdxResult->replaceAllUsesWith(ExtractLastActive);
- RdxResult->eraseFromParent();
+ // Find the condition for the select
+ auto *SR = dyn_cast<VPWidenSelectRecipe>(&PhiR->getBackedgeRecipe());
+ if (!SR)
+ continue;
+ VPValue *Cond = SR->getCond();
+
+ // Add mask phi
+ VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
+ VPValue *False = Plan.getOrAddLiveIn(
+ ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext()));
+ auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc());
+ Builder.insert(MaskPHI);
+
+ // Add select for mask
+ Builder.setInsertPoint(SR);
+ VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
+ VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
+ MaskPHI->addOperand(MaskSelect);
+
+ // Replace select for data
+ VPValue *DataSelect = Builder.createSelect(
+ AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
+ SR->replaceAllUsesWith(DataSelect);
+ SR->eraseFromParent();
+
+ // Find final reduction and replace it with an
+ // extract.last.active intrinsic.
+ VPInstruction *RdxResult = nullptr;
+ for (VPUser *U : DataSelect->users()) {
+ VPInstruction *I = dyn_cast<VPInstruction>(U);
+ if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) {
+ RdxResult = I;
+ break;
+ }
}
+
+ assert(RdxResult);
+ Builder.setInsertPoint(RdxResult);
+ auto *ExtractLastActive =
+ Builder.createNaryOp(VPInstruction::ExtractLastActive,
+ {DataSelect, MaskSelect, PhiR->getStartValue()},
+ RdxResult->getDebugLoc());
+ RdxResult->replaceAllUsesWith(ExtractLastActive);
+ RdxResult->eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index fd64346df158a..39c87ae76941a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -24,7 +24,6 @@ namespace llvm {
class InductionDescriptor;
class Instruction;
class LoopVersioning;
-class LoopVectorizationLegality;
class PHINode;
class ScalarEvolution;
class PredicatedScalarEvolution;
@@ -396,8 +395,7 @@ struct VPlanTransforms {
/// for entire vectors for both the latest mask containing at least one active
/// element and the corresponding data vector.
static void convertFindLastRecurrences(VPlan &Plan,
- VPRecipeBuilder &RecipeBuilder,
- LoopVectorizationLegality *Legal);
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
>From 3f8123be0adadd2f63479bb18493d8ec04a5186b Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 12 Nov 2025 11:40:02 +0000
Subject: [PATCH 04/21] Remove VPLastActiveMaskPHIRecipe
---
llvm/lib/Analysis/IVDescriptors.cpp | 8 ----
.../Transforms/Vectorize/LoopVectorize.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 43 +++----------------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 16 +++----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 +++--------
.../Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +-
.../conditional-scalar-assignment-vplan.ll | 4 +-
8 files changed, 25 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b34e204dddd0f..3fbbcddcc91c1 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -728,14 +728,6 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
// FIXME: Support more complex patterns, including multiple selects.
// The Select must be used only outside the loop and by the PHI.
- for (User *U : I->users()) {
- if (U == OrigPhi)
- continue;
- if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI))
- continue;
- return InstDesc(false, I);
- }
-
return InstDesc(I, RecurKind::FindLast);
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd043f0950fbe..e1064b605bbe8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9406,7 +9406,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
for (VPRecipeBase &R :
EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe, VPLastActiveMaskPHIRecipe>(&R))
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
EpiWidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1081efa21dbb3..b84b5cc424ec3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2336,8 +2336,9 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe,
}
VPWidenPHIRecipe *clone() override {
- auto *C = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingValue()),
- getOperand(0), getDebugLoc(), Name);
+ auto *C =
+ new VPWidenPHIRecipe(cast_if_present<PHINode>(getUnderlyingValue()),
+ getOperand(0), getDebugLoc(), Name);
for (VPValue *Op : llvm::drop_begin(operands()))
C->addOperand(Op);
return C;
@@ -2350,6 +2351,10 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe,
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
+ /// Return the cost of this VPWidenPHIRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
protected:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
@@ -3642,40 +3647,6 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
#endif
};
-// TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close
-// to this (just a PHI of a predicate), but isn't a header phi so can't
-// be used for the mask of FindLastActive reductions.
-//
-// This is basically a clone of VPActiveLaneMaskPHIRecipe, but won't run into
-// problems with transforms that expect there to only be a single ALM PHI, and
-// can be ignored by other code looking for a (non-existent) underlying value.
-class VPLastActiveMaskPHIRecipe : public VPHeaderPHIRecipe {
-public:
- VPLastActiveMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
- : VPHeaderPHIRecipe(VPDef::VPLastActiveMaskPHISC, nullptr, StartMask,
- DL) {}
-
- ~VPLastActiveMaskPHIRecipe() override = default;
-
- VPLastActiveMaskPHIRecipe *clone() override {
- auto *R = new VPLastActiveMaskPHIRecipe(getOperand(0), getDebugLoc());
- if (getNumOperands() == 2)
- R->addOperand(getOperand(1));
- return R;
- }
-
- VP_CLASSOF_IMPL(VPDef::VPLastActiveMaskPHISC);
-
- /// Generate the mask phi
- void execute(VPTransformState &State) override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
/// A recipe for generating the phi node for the current index of elements,
/// adjusted in accordance with EVL value. It starts at the start value of the
/// canonical induction and gets incremented by EVL in each iteration of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 3168d6652f03b..eada2c65bc820 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -277,14 +277,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
- VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe,
- VPLastActiveMaskPHIRecipe>([this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
+ [this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 13a802c5287d5..b17768cf830c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -4493,6 +4493,11 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
State.set(this, VecPhi);
}
+InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -4527,27 +4532,6 @@ void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPLastActiveMaskPHIRecipe::execute(VPTransformState &State) {
- BasicBlock *VectorPH =
- State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
- Value *StartMask = State.get(getOperand(0));
- PHINode *Phi =
- State.Builder.CreatePHI(StartMask->getType(), 2, "last.active.mask");
- Phi->addIncoming(StartMask, VectorPH);
- State.set(this, Phi);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPLastActiveMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "LAST-ACTIVE-MASK-PHI ";
-
- printAsOperand(O, SlotTracker);
- O << " = phi ";
- printOperands(O, SlotTracker);
-}
-#endif
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1ec2ede0676fb..77d5ee0f263e6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4941,7 +4941,7 @@ void VPlanTransforms::convertFindLastRecurrences(
VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
VPValue *False = Plan.getOrAddLiveIn(
ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext()));
- auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc());
+ auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc());
Builder.insert(MaskPHI);
// Add select for mask
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4dbcabd2db5c2..839a304904e8b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -48,7 +48,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
}
bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
- if (isa<VPActiveLaneMaskPHIRecipe, VPLastActiveMaskPHIRecipe>(V))
+ if (isa<VPActiveLaneMaskPHIRecipe>(V))
return true;
auto IsWideCanonicalIV = [](VPValue *A) {
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index e802093fc7886..6d63e2f927df1 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -48,7 +48,7 @@ exit:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
-; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8>
+; CHECK-NEXT: WIDEN-PHI vp<%4> = phi [ ir<false>, vector.ph ], [ vp<%8>, vector.body ]
; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr>
@@ -95,7 +95,7 @@ exit:
; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N
; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<%4> = phi [ ir<false>, vector.ph ], [ vp<%8>, vector.body ]
; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr>
>From e0609ea41ac67c3b0471b5ac77d4d671948deed1 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 12 Nov 2025 14:25:37 +0000
Subject: [PATCH 05/21] Add regex matches to vplan test
---
.../conditional-scalar-assignment-vplan.ll | 122 +++++++++---------
1 file changed, 61 insertions(+), 61 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index 6d63e2f927df1..23964f65b7aae 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -33,10 +33,10 @@ exit:
; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
-; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<[[ORIGTC:%.*]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
@@ -46,78 +46,78 @@ exit:
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
-; CHECK-NEXT: WIDEN-PHI vp<%4> = phi [ ir<false>, vector.ph ], [ vp<%8>, vector.body ]
-; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
-; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
-; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr>
-; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6>
-; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
-; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp>
-; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4>
-; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CIV:%.*]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT:%.*]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[DATAPHI:%.*]]> = phi ir<-1>, vp<[[DATASELECT:%.*]]>
+; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir<false>, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, vector.body ]
+; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: CLONE ir<[[LDADDR:%.*]]> = getelementptr inbounds ir<%data>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer ir<[[LDADDR]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.*]]> = load vp<[[VPTR]]>
+; CHECK-NEXT: WIDEN ir<[[SELECTCMP:%.*]]> = icmp slt ir<%a>, ir<[[LD]]>
+; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[SELECTCMP]]>
+; CHECK-NEXT: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]>
+; CHECK-NEXT: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]>
+; CHECK-NEXT: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: EMIT vp<[[EXTRACTLAST:%.*]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1>
+; CHECK-NEXT: EMIT vp<[[TCCMP:%.*]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[TCCMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: IR [[SELECTLCSSA:%.*]] = phi i32 [ [[SELECTDATA:%.*]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[RESUMEVAL:%.*]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[MERGERDX:%.*]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<loop>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
-; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4
-; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld
-; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
-; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1
-; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: IR [[IV:%.*]] = phi i64 [ 0, %entry ], [ [[IVNEXT:%.*]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph)
+; CHECK-NEXT: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph)
+; CHECK-NEXT: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]]
+; CHECK-NEXT: IR [[LD]] = load i32, ptr [[LDADDR]], align 4
+; CHECK-NEXT: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]]
+; CHECK-NEXT: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]]
+; CHECK-NEXT: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<%4> = phi [ ir<false>, vector.ph ], [ vp<%8>, vector.body ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK: Cost of 1 for VF vscale x 1: induction instruction [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ]
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction [[EXITCMP]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[CIV]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<[[DATAPHI]]> = phi ir<-1>, vp<[[DATASELECT]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<[[MASKPHI]]> = phi [ ir<false>, vector.ph ], [ vp<[[MASKSELECT]]>, vector.body ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[STEPS]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<[[LDADDR]]> = getelementptr inbounds ir<%data>, vp<[[STEPS]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[VPTR]]> = vector-pointer ir<[[LDADDR]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[LD]]> = load vp<[[VPTR]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[SELECTCMP]]> = icmp slt ir<%a>, ir<[[LD]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[ANYOF]]> = any-of ir<[[SELECTCMP]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]>
; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb<entry> ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[RESUMEVAL]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[MERGERDX]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LD]] = load i32, ptr [[LDADDR]], align 4
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[EXITCMP]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[EXTRACTLAST]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[TCCMP]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<[[TCCMP]]>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTLCSSA]] = phi i32 [ [[SELECTDATA]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block)
>From ced78584aff76e6dfb22d1f22cb2ba0c5a79742d Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 12 Nov 2025 15:22:22 +0000
Subject: [PATCH 06/21] Use find_if
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 77d5ee0f263e6..8ecc4a0c603bf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4956,18 +4956,15 @@ void VPlanTransforms::convertFindLastRecurrences(
SR->replaceAllUsesWith(DataSelect);
SR->eraseFromParent();
- // Find final reduction and replace it with an
+ // Find final reduction computation and replace it with an
// extract.last.active intrinsic.
- VPInstruction *RdxResult = nullptr;
- for (VPUser *U : DataSelect->users()) {
+ VPUser **ComputeRdx = find_if(DataSelect->users(), [](VPUser *U) {
VPInstruction *I = dyn_cast<VPInstruction>(U);
- if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) {
- RdxResult = I;
- break;
- }
- }
-
- assert(RdxResult);
+ return I && I->getOpcode() == VPInstruction::ComputeReductionResult;
+ });
+ assert(ComputeRdx != DataSelect->user_end() &&
+ "Unable to find Reduction Result Recipe");
+ VPInstruction *RdxResult = cast<VPInstruction>(*ComputeRdx);
Builder.setInsertPoint(RdxResult);
auto *ExtractLastActive =
Builder.createNaryOp(VPInstruction::ExtractLastActive,
>From b68b2bd2ed9737d37d5f76d80683cba093c5cb15 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 19 Nov 2025 11:54:39 +0000
Subject: [PATCH 07/21] Handle FindLast properly in unrolling, test
---
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 3 +-
.../LoopUnroll/partial-unroll-reductions.ll | 52 +++++++++++++++++++
2 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 5b94897f4342f..5b6c311a0dcff 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1260,7 +1260,8 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
- RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindLastRecurrenceKind(RK))
return std::nullopt;
if (RdxDesc.IntermediateStore)
diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
index 220a4a29a3041..cdbcfb1a4c605 100644
--- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
+++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll
@@ -677,3 +677,55 @@ loop:
exit:
ret <4 x i32> %rdx.next
}
+
+define i32 @test_findlast_reduction(ptr %data, i32 %a) {
+; CHECK-LABEL: define i32 @test_findlast_reduction(
+; CHECK-SAME: ptr [[DATA:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; CHECK-NEXT: [[SELECT_DATA:%.*]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[LD_ADDR_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[LD_1:%.*]] = load i32, ptr [[LD_ADDR_1]], align 4
+; CHECK-NEXT: [[SELECT_CMP_1:%.*]] = icmp slt i32 [[A]], [[LD_1]]
+; CHECK-NEXT: [[SELECT_DATA_1:%.*]] = select i1 [[SELECT_CMP_1]], i32 [[LD_1]], i32 [[SELECT_DATA]]
+; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[LD_ADDR_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT: [[LD_2:%.*]] = load i32, ptr [[LD_ADDR_2]], align 4
+; CHECK-NEXT: [[SELECT_CMP_2:%.*]] = icmp slt i32 [[A]], [[LD_2]]
+; CHECK-NEXT: [[SELECT_DATA_2:%.*]] = select i1 [[SELECT_CMP_2]], i32 [[LD_2]], i32 [[SELECT_DATA_1]]
+; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT: [[LD_ADDR_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT: [[LD_3:%.*]] = load i32, ptr [[LD_ADDR_3]], align 4
+; CHECK-NEXT: [[SELECT_CMP_3:%.*]] = icmp slt i32 [[A]], [[LD_3]]
+; CHECK-NEXT: [[SELECT_DATA_3]] = select i1 [[SELECT_CMP_3]], i32 [[LD_3]], i32 [[SELECT_DATA_2]]
+; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[EXIT_CMP_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 200
+; CHECK-NEXT: br i1 [[EXIT_CMP_3]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_3]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, 200
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
>From abd3a56131ab14d72211c545d8cc35faee16b8de Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 19 Nov 2025 12:01:48 +0000
Subject: [PATCH 08/21] Remove instcombine from AArch64 FindLast runlines
---
.../AArch64/conditional-scalar-assignment.ll | 94 +++++++++++--------
1 file changed, 53 insertions(+), 41 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 25c698f3df245..8b80c161c438c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
-; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON
-; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE
+; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=NEON
+; RUN: opt -passes=loop-vectorize -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE
;; The following run line caused an ICE before using a dedicated FindLast PHI recipe.
;; We're not looking at the resulting IR, just confirming it doesn't crash.
-; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null
+; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null
target triple = "aarch64-linux-gnu"
@@ -16,7 +16,7 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
@@ -24,41 +24,41 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: ret i32 [[SELECT_DATA]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define i32 @simple_csa_int_select(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]]
; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; SVE: [[VECTOR_PH]]:
; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A_FR]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SVE: [[VECTOR_BODY]]:
; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <vscale x 4 x i32> [[WIDE_LOAD]]
-; SVE-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]]
+; SVE-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; SVE-NEXT: [[TMP7:%.*]] = freeze <vscale x 4 x i1> [[TMP13]]
; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP7]])
-; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[LAST_ACTIVE_MASK]]
-; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD_FR]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SVE: [[MIDDLE_BLOCK]]:
; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> [[TMP9]], i32 -1)
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; SVE: [[SCALAR_PH]]:
; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -69,7 +69,7 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
-; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A_FR]], [[LD]]
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -104,7 +104,7 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
@@ -113,7 +113,8 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: ret ptr [[SELECT_DATA]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret ptr [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define ptr @simple_csa_ptr_select(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] {
@@ -122,7 +123,7 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
; SVE: [[LOOP]]:
; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
@@ -131,7 +132,8 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; SVE: [[EXIT]]:
-; SVE-NEXT: ret ptr [[SELECT_DATA]]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: ret ptr [[SELECT_DATA_LCSSA]]
;
entry:
br label %loop
@@ -160,7 +162,7 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
@@ -168,7 +170,8 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: ret float [[SELECT_DATA]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret float [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define float @simple_csa_float_select(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -177,7 +180,7 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
; SVE: [[LOOP]]:
; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
@@ -185,7 +188,8 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; SVE: [[EXIT]]:
-; SVE-NEXT: ret float [[SELECT_DATA]]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: ret float [[SELECT_DATA_LCSSA]]
;
entry:
br label %loop
@@ -213,17 +217,18 @@ define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) {
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
-; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]]
+; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: ret i32 [[SELECT_DATA]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define i32 @multi_user_csa_int_select(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
@@ -232,17 +237,18 @@ define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) {
; SVE: [[LOOP]]:
; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
-; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]]
+; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; SVE: [[EXIT]]:
-; SVE-NEXT: ret i32 [[SELECT_DATA]]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
br label %loop
@@ -274,7 +280,7 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
@@ -283,8 +289,10 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32
-; NEON-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ]
+; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32
+; NEON-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]]
; NEON-NEXT: ret i32 [[RES]]
;
; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select(
@@ -295,7 +303,7 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
-; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
@@ -304,8 +312,10 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; SVE: [[EXIT]]:
-; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32
-; SVE-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ]
+; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32
+; SVE-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]]
; SVE-NEXT: ret i32 [[RES]]
;
entry:
@@ -339,11 +349,11 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]]
+; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
-; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]]
+; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]]
; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
@@ -351,7 +361,8 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; NEON: [[EXIT]]:
-; NEON-NEXT: ret i32 [[SELECT_DATA]]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define i32 @chained_select_for_csa_int_select(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
@@ -360,11 +371,11 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3
; SVE: [[LOOP]]:
; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
-; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]]
+; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
-; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]]
+; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]]
; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
@@ -372,7 +383,8 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; SVE: [[EXIT]]:
-; SVE-NEXT: ret i32 [[SELECT_DATA]]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
br label %loop
>From f46dce1588d1d3d25ece31c7ed91bd6fc65397d9 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 19 Nov 2025 12:15:07 +0000
Subject: [PATCH 09/21] Switched vplan print test to use fixed VF instead of
scalable
---
.../conditional-scalar-assignment-vplan.ll | 38 +------------------
1 file changed, 2 insertions(+), 36 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index 23964f65b7aae..79e5ca2cc7a1d 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -1,6 +1,5 @@
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \
-; RUN: -disable-output 2>&1 < %s | FileCheck %s
+; RUN: -force-vector-width=4 -disable-output 2>&1 < %s | FileCheck %s
; This function is derived from the following C program:
@@ -31,8 +30,7 @@ exit:
ret i32 %select.data
}
-
-; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count
@@ -89,35 +87,3 @@ exit:
; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-
-; CHECK: Cost of 1 for VF vscale x 1: induction instruction [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ]
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[CIV]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<[[DATAPHI]]> = phi ir<-1>, vp<[[DATASELECT]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<[[MASKPHI]]> = phi [ ir<false>, vector.ph ], [ vp<[[MASKSELECT]]>, vector.body ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<[[LDADDR]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[VPTR]]> = vector-pointer ir<[[LDADDR]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[LD]]> = load vp<[[VPTR]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[SELECTCMP]]> = icmp slt ir<%a>, ir<[[LD]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[ANYOF]]> = any-of ir<[[SELECTCMP]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]>
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[RESUMEVAL]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[MERGERDX]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb<entry> ]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph)
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph)
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LD]] = load i32, ptr [[LDADDR]], align 4
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]]
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
-; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[EXTRACTLAST]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[TCCMP]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<[[TCCMP]]>
-; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTLCSSA]] = phi i32 [ [[SELECTDATA]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block)
>From ecd2f7b8d10294fc270435b7ae35b61f74ca144e Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 19 Nov 2025 14:03:42 +0000
Subject: [PATCH 10/21] Moved vectorized epilogue ICE test to separate file,
removed unnecessary check lines
---
.../AArch64/conditional-scalar-assignment.ll | 4 -
.../conditional-scalar-assignment-vplan.ll | 12 ---
.../AArch64/findlast-epilogue-loop.ll | 78 +++++++++++++++++++
3 files changed, 78 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 8b80c161c438c..01e76cae2db7f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -2,10 +2,6 @@
; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=NEON
; RUN: opt -passes=loop-vectorize -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE
-;; The following run line caused an ICE before using a dedicated FindLast PHI recipe.
-;; We're not looking at the resulting IR, just confirming it doesn't crash.
-; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null
-
target triple = "aarch64-linux-gnu"
define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index 79e5ca2cc7a1d..64dc57f7ec492 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -75,15 +75,3 @@ exit:
; CHECK-NEXT: EMIT-SCALAR vp<[[RESUMEVAL:%.*]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: EMIT-SCALAR vp<[[MERGERDX:%.*]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR [[IV:%.*]] = phi i64 [ 0, %entry ], [ [[IVNEXT:%.*]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph)
-; CHECK-NEXT: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph)
-; CHECK-NEXT: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]]
-; CHECK-NEXT: IR [[LD]] = load i32, ptr [[LDADDR]], align 4
-; CHECK-NEXT: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]]
-; CHECK-NEXT: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]]
-; CHECK-NEXT: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]]
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll
new file mode 100644
index 0000000000000..8ceff41be73b3
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 | FileCheck %s
+
+; This test is mainly confirming that we don't crash when vectorizing a findlast
+; reduction and trying to use a vectorized epilogue loop. Once support for that
+; has been added, this test can be removed.
+
+target triple = "aarch64-linux-gnu"
+
+define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
+; CHECK-LABEL: define i32 @simple_csa_int_select(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> [[TMP9]], i32 -1)
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; CHECK-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
>From f54363abb3b88fcaaf7bf930867a748717645a27 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 19 Nov 2025 16:43:32 +0000
Subject: [PATCH 11/21] Updated check output after rebase
---
.../LoopVectorize/iv-select-cmp-decreasing.ll | 8 +++----
.../LoopVectorize/iv-select-cmp-no-wrap.ll | 2 +-
.../iv-select-cmp-non-const-iv-start.ll | 16 ++++++-------
.../LoopVectorize/iv-select-cmp-trunc.ll | 24 +++++++++----------
.../Transforms/LoopVectorize/iv-select-cmp.ll | 12 +++++-----
5 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 503837894a7b4..cdd76957afcf1 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -1135,7 +1135,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC1VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
; IC1VF4: [[VECTOR_BODY]]:
; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1160,7 +1160,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; IC1VF4: [[MIDDLE_BLOCK]]:
@@ -1204,7 +1204,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC4VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
; IC4VF4: [[VECTOR_BODY]]:
; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1229,7 +1229,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; IC4VF4: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index 18f1470aba3a5..377afaea8f7fd 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -168,7 +168,7 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 7a89c32b197d3..115956b977cee 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -21,7 +21,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -37,7 +37,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -83,7 +83,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -99,7 +99,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -192,7 +192,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -208,7 +208,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -257,7 +257,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -273,7 +273,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index 839ea7ce7e7a4..ad5189c35cfd7 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -709,7 +709,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -763,7 +763,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -874,7 +874,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -929,7 +929,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1037,7 +1037,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1066,7 +1066,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1143,7 +1143,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1199,7 +1199,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1296,7 +1296,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1338,7 +1338,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1433,7 +1433,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1495,7 +1495,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 6001ee32ca62a..e3df1d3c39cca 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1977,7 +1977,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -2035,7 +2035,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -2125,7 +2125,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -2143,7 +2143,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -2185,7 +2185,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -2203,7 +2203,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
>From efe2d2788a02b41314cb97c8a3078646ed47efbf Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 20 Nov 2025 12:03:41 +0000
Subject: [PATCH 12/21] Move epilogue vectorization test back to LV
---
.../AArch64/findlast-epilogue-loop.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
rename llvm/test/Transforms/{PhaseOrdering => LoopVectorize}/AArch64/findlast-epilogue-loop.ll (95%)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll
similarity index 95%
rename from llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll
rename to llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll
index 8ceff41be73b3..8e4a8457414bc 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 | FileCheck %s
-; This test is mainly confirming that we don't crash when vectorizing a findlast
-; reduction and trying to use a vectorized epilogue loop. Once support for that
-; has been added, this test can be removed.
+;; This test currently ensures that we don't crash when vectorizing loops with
+;; conditional scalar assignment when epilogue vectorization is either requested
+;; or costed as profitable.
target triple = "aarch64-linux-gnu"
>From bf3281bb2316451365772458eb3562b5b7acc2ac Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 20 Nov 2025 12:28:59 +0000
Subject: [PATCH 13/21] Improve IVDesc comments
---
llvm/lib/Analysis/IVDescriptors.cpp | 36 ++++++++++++++++++-----------
1 file changed, 23 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 3fbbcddcc91c1..3eaafd41ada92 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -672,9 +672,15 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi,
// if (src[i] > 3)
// r = i;
// }
+// or like this:
+// int r = 0;
+// for (int i = 0; i < n; i++) {
+// if (a[i] > 3)
+// r = a[i];
+// }
// The reduction value (r) is derived from either the values of an induction
-// variable (i) sequence, or from the start value (0). The LLVM IR generated for
-// such loops would be as follows:
+// variable (i) sequence, an arbitrary value (a[i]), or from the start value
+// (0). The LLVM IR generated for such loops would be as follows:
// for.body:
// %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ]
// %i = phi i32 [ %inc, %for.body ], [ 0, %entry ]
@@ -683,19 +689,23 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi,
// %spec.select = select i1 %cmp, i32 %i, i32 %r
// %inc = add nsw i32 %i, 1
// ...
-// Since 'i' is an induction variable, the reduction value after the loop will
-// be the maximum (increasing induction) or minimum (decreasing induction) value
-// of 'i' that the condition (src[i] > 3) is satisfied, or the start value (0 in
-// the example above). When the start value of the induction variable 'i' is
-// greater than the minimum (increasing induction) or maximum (decreasing
-// induction) value of the data type, we can use the minimum (increasing
-// induction) or maximum (decreasing induction) value of the data type as a
-// sentinel value to replace the start value. This allows us to perform a single
-// reduction max (increasing induction) or min (decreasing induction) operation
-// to obtain the final reduction result.
+// When searching for an induction variable (i), the reduction value after the
+// loop will be the maximum (increasing induction) or minimum (decreasing
+// induction) value of 'i' where the condition (src[i] > 3) is satisfied, or the
+// start value (0 in the example above). When the start value of the induction
+// variable 'i' is greater than the minimum (increasing induction) or maximum
+// (decreasing induction) value of the data type, we can use the minimum
+// (increasing induction) or maximum (decreasing induction) value of the data
+// type as a sentinel value to replace the start value. This allows us to
+// perform a single reduction max (increasing induction) or min (decreasing
+// induction) operation to obtain the final reduction result.
// TODO: It is possible to solve the case where the start value is the minimum
// value of the data type or a non-constant value by using mask and multiple
// reduction operations.
+//
+// When searching for an arbitrary value (such as 'a[i]'), the reduction value
+// will either be the initial value (0) if the condition was never met, or the
+// value of a[i] in the most recent loop iteration where the condition was met.
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
PHINode *OrigPhi, Instruction *I,
@@ -710,7 +720,7 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
// We are looking for selects of the form:
// select(cmp(), phi, value) or
// select(cmp(), value, phi)
- // where 'value' is be a loop induction variable
+ // where 'value' must be a loop induction variable
// (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast).
// TODO: Match selects with multi-use cmp conditions.
Value *NonRdxPhi = nullptr;
>From cf3bc322014ad3981c38237de9050aa19c0d60d6 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 20 Nov 2025 12:30:11 +0000
Subject: [PATCH 14/21] Remove traces of dedicated FindLast phi recipe
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlan.h | 1 -
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 -
3 files changed, 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e1064b605bbe8..ee13dfe30280c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4084,7 +4084,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
continue;
case VPDef::VPReductionSC:
case VPDef::VPActiveLaneMaskPHISC:
- case VPDef::VPLastActiveMaskPHISC:
case VPDef::VPWidenCallSC:
case VPDef::VPWidenCanonicalIVSC:
case VPDef::VPWidenCastSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b84b5cc424ec3..b89510b7d8385 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -561,7 +561,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
case VPRecipeBase::VPActiveLaneMaskPHISC:
- case VPRecipeBase::VPLastActiveMaskPHISC:
case VPRecipeBase::VPFirstOrderRecurrencePHISC:
case VPRecipeBase::VPWidenPHISC:
case VPRecipeBase::VPWidenIntOrFpInductionSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 55e7af00f8fc4..63eacd3d75721 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -374,7 +374,6 @@ class VPDef {
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
- VPLastActiveMaskPHISC,
VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
>From a221ea46a3085cd609d3b849d9a5dcee3f093e45 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 20 Nov 2025 12:31:04 +0000
Subject: [PATCH 15/21] Move and improve convertFindLastRecurrences
---
.../Vectorize/VPlanConstruction.cpp | 63 ++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 72 -------------------
2 files changed, 63 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 5fbd61a929fe2..549efeaa1a1a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1001,3 +1001,66 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
MiddleTerm->setOperand(0, NewCond);
return true;
}
+
+void VPlanTransforms::convertFindLastRecurrences(
+ VPlan &Plan, VPRecipeBuilder &RecipeBuilder) {
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ // We want to create the following nodes:
+ // vec.body:
+ // mask.phi = phi <VF x i1> [ all.false, vec.ph ], [ new.mask, vec.body ]
+ // ...data.phi already exists, but needs updating...
+ // data.phi = phi <VF x Ty> [ default.val, vec.ph ], [ new.data, vec.body ]
+ //
+ // ...'data' and 'compare' created by existing nodes...
+ //
+ // any_active = i1 any_of_reduction(compare)
+ // new.mask = select any_active, compare, mask.phi
+ // new.data = select any_active, data, data.phi
+ //
+ // middle.block:
+ // result = extract-last-active new.data, new.mask, default.val
+
+ for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
+ if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
+ PhiR->getRecurrenceKind()))
+ continue;
+
+ // Find the condition for the select
+ auto *SR = cast<VPWidenSelectRecipe>(&PhiR->getBackedgeRecipe());
+ VPValue *Cond = SR->getCond();
+
+ // Add mask phi
+ VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
+ VPValue *False = Plan.getOrAddLiveIn(
+ ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext()));
+ auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc());
+ Builder.insert(MaskPHI);
+
+ // Add select for mask
+ Builder.setInsertPoint(SR);
+ VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
+ VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
+ MaskPHI->addOperand(MaskSelect);
+
+ // Replace select for data
+ VPValue *DataSelect = Builder.createSelect(
+ AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
+ SR->replaceAllUsesWith(DataSelect);
+ SR->eraseFromParent();
+
+ // Find final reduction computation and replace it with an
+ // extract.last.active intrinsic.
+ VPInstruction *RdxResult = findComputeReductionResult(PhiR);
+ assert(RdxResult && "Unable to find Reduction Result Recipe");
+ Builder.setInsertPoint(RdxResult);
+ auto *ExtractLastActive =
+ Builder.createNaryOp(VPInstruction::ExtractLastActive,
+ {DataSelect, MaskSelect, PhiR->getStartValue()},
+ RdxResult->getDebugLoc());
+ RdxResult->replaceAllUsesWith(ExtractLastActive);
+ RdxResult->eraseFromParent();
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ecc4a0c603bf..8bf1003e923b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4902,75 +4902,3 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
-
-void VPlanTransforms::convertFindLastRecurrences(
- VPlan &Plan, VPRecipeBuilder &RecipeBuilder) {
-
- // May need to do something better than this?
- if (Plan.hasScalarVFOnly())
- return;
-
- // We want to create the following nodes:
- // vec.body:
- // mask.phi = phi <VF x i1> [ all.false, vec.ph ], [ new.mask, vec.body ]
- // ...data.phi already exists, but needs updating...
- // data.phi = phi <VF x Ty> [ default.val, vec.ph ], [ new.data, vec.body ]
- //
- // ...'data' and 'compare' created by existing nodes...
- //
- // any_active = i1 any_of_reduction(compare)
- // new.mask = select any_active, compare, mask.phi
- // new.data = select any_active, data, data.phi
- //
- // middle.block:
- // result = extract-last-active new.data, new.mask, default.val
-
- for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
- if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind(
- PhiR->getRecurrenceKind()))
- continue;
-
- // Find the condition for the select
- auto *SR = dyn_cast<VPWidenSelectRecipe>(&PhiR->getBackedgeRecipe());
- if (!SR)
- continue;
- VPValue *Cond = SR->getCond();
-
- // Add mask phi
- VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
- VPValue *False = Plan.getOrAddLiveIn(
- ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext()));
- auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc());
- Builder.insert(MaskPHI);
-
- // Add select for mask
- Builder.setInsertPoint(SR);
- VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
- VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
- MaskPHI->addOperand(MaskSelect);
-
- // Replace select for data
- VPValue *DataSelect = Builder.createSelect(
- AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
- SR->replaceAllUsesWith(DataSelect);
- SR->eraseFromParent();
-
- // Find final reduction computation and replace it with an
- // extract.last.active intrinsic.
- VPUser **ComputeRdx = find_if(DataSelect->users(), [](VPUser *U) {
- VPInstruction *I = dyn_cast<VPInstruction>(U);
- return I && I->getOpcode() == VPInstruction::ComputeReductionResult;
- });
- assert(ComputeRdx != DataSelect->user_end() &&
- "Unable to find Reduction Result Recipe");
- VPInstruction *RdxResult = cast<VPInstruction>(*ComputeRdx);
- Builder.setInsertPoint(RdxResult);
- auto *ExtractLastActive =
- Builder.createNaryOp(VPInstruction::ExtractLastActive,
- {DataSelect, MaskSelect, PhiR->getStartValue()},
- RdxResult->getDebugLoc());
- RdxResult->replaceAllUsesWith(ExtractLastActive);
- RdxResult->eraseFromParent();
- }
-}
>From 023fbc93856fb44e3ef63fb2e010950302a68684 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 20 Nov 2025 12:31:38 +0000
Subject: [PATCH 16/21] Add test with extra user for select
---
.../AArch64/conditional-scalar-assignment.ll | 61 +++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 01e76cae2db7f..e777659f003c8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -403,3 +403,64 @@ loop:
exit:
ret i32 %select.data
}
+
+define i32 @csa_with_extra_use_of_select(i64 %N, ptr readonly %data, ptr noalias %out, i32 %a) {
+; NEON-LABEL: define i32 @csa_with_extra_use_of_select(
+; NEON-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; NEON-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]]
+; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @csa_with_extra_use_of_select(
+; SVE-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
+; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
+; SVE-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]]
+; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %st.addr = getelementptr inbounds i32, ptr %out, i64 %iv
+ store i32 %select.data, ptr %st.addr, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
>From 01a9be61ef752de0ffa91faeb2d5591f33460781 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 25 Nov 2025 13:54:01 +0000
Subject: [PATCH 17/21] Call xform earlier
---
.../Transforms/Vectorize/LoopVectorize.cpp | 9 +++--
.../Vectorize/VPlanConstruction.cpp | 39 +++++++++++++------
.../Transforms/Vectorize/VPlanTransforms.h | 14 ++++---
3 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ee13dfe30280c..4153140d4d02a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8558,6 +8558,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ if (!VPlanTransforms::runPass(VPlanTransforms::handleFindLastReductions,
+ *Plan, RecipeBuilder))
+ return nullptr;
+
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
@@ -8596,10 +8601,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
- // Create whole-vector selects for find-last recurrences.
- VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan,
- RecipeBuilder);
-
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 549efeaa1a1a2..d7ada10b15896 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
@@ -1002,10 +1003,10 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
return true;
}
-void VPlanTransforms::convertFindLastRecurrences(
- VPlan &Plan, VPRecipeBuilder &RecipeBuilder) {
+bool VPlanTransforms::handleFindLastReductions(VPlan &Plan,
+ VPRecipeBuilder &RecipeBuilder) {
if (Plan.hasScalarVFOnly())
- return;
+ return false;
// We want to create the following nodes:
// vec.body:
@@ -1029,8 +1030,20 @@ void VPlanTransforms::convertFindLastRecurrences(
continue;
// Find the condition for the select
- auto *SR = cast<VPWidenSelectRecipe>(&PhiR->getBackedgeRecipe());
- VPValue *Cond = SR->getCond();
+ auto *SelectR = cast<VPSingleDefRecipe>(&PhiR->getBackedgeRecipe());
+ VPValue *Cond = nullptr;
+ if (auto *WidenR = dyn_cast<VPWidenSelectRecipe>(SelectR))
+ Cond = WidenR->getCond();
+ else if (auto *RepR = dyn_cast<VPReplicateRecipe>(SelectR)) {
+ auto *SI = dyn_cast<SelectInst>(RepR->getUnderlyingInstr());
+ if (!SI)
+ return false;
+ auto *CmpI = dyn_cast<Instruction>(SI->getCondition());
+ if (!CmpI)
+ return false;
+ Cond = RecipeBuilder.getRecipe(CmpI)->getVPSingleValue();
+ } else
+ return false;
// Add mask phi
VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
@@ -1040,21 +1053,23 @@ void VPlanTransforms::convertFindLastRecurrences(
Builder.insert(MaskPHI);
// Add select for mask
- Builder.setInsertPoint(SR);
+ Builder.setInsertPoint(SelectR);
VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI);
MaskPHI->addOperand(MaskSelect);
// Replace select for data
- VPValue *DataSelect = Builder.createSelect(
- AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc());
- SR->replaceAllUsesWith(DataSelect);
- SR->eraseFromParent();
+ VPValue *DataSelect =
+ Builder.createSelect(AnyOf, SelectR->getOperand(1),
+ SelectR->getOperand(2), SelectR->getDebugLoc());
+ SelectR->replaceAllUsesWith(DataSelect);
+ SelectR->eraseFromParent();
// Find final reduction computation and replace it with an
// extract.last.active intrinsic.
VPInstruction *RdxResult = findComputeReductionResult(PhiR);
- assert(RdxResult && "Unable to find Reduction Result Recipe");
+ if (!RdxResult)
+ return false;
Builder.setInsertPoint(RdxResult);
auto *ExtractLastActive =
Builder.createNaryOp(VPInstruction::ExtractLastActive,
@@ -1063,4 +1078,6 @@ void VPlanTransforms::convertFindLastRecurrences(
RdxResult->replaceAllUsesWith(ExtractLastActive);
RdxResult->eraseFromParent();
}
+
+ return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 39c87ae76941a..2d6222f776588 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -161,6 +161,14 @@ struct VPlanTransforms {
/// this attempt was unsuccessful.
static bool handleMaxMinNumReductions(VPlan &Plan);
+ /// Check if \p Plan contains any FindLast reductions. If it does, try to
+ /// update the vector loop so that whole-vector selects preserve both the
+ /// latest mask containing at least one active element and the corresponding
+ /// data vector for use after the loop. Return false if this attempt was
+ /// unsuccessful.
+ static bool handleFindLastReductions(VPlan &Plan,
+ VPRecipeBuilder &RecipeBuilder);
+
/// Clear NSW/NUW flags from reduction instructions if necessary.
static void clearReductionWrapFlags(VPlan &Plan);
@@ -390,12 +398,6 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
-
- /// Change FindLast reductions to save the appropriate state using selects
- /// for entire vectors for both the latest mask containing at least one active
- /// element and the corresponding data vector.
- static void convertFindLastRecurrences(VPlan &Plan,
- VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
>From bdc89ed3627fe3a6d939f699ad5f7da48f59bbf9 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 25 Nov 2025 14:00:43 +0000
Subject: [PATCH 18/21] Use Plan.getFalse()
---
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index d7ada10b15896..e3a294e21b2ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1047,9 +1047,7 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan,
// Add mask phi
VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
- VPValue *False = Plan.getOrAddLiveIn(
- ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext()));
- auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc());
+ auto *MaskPHI = new VPWidenPHIRecipe(nullptr, Plan.getFalse());
Builder.insert(MaskPHI);
// Add select for mask
>From d61fd944455b68559dc8d3fade21b64e20eb4c02 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 25 Nov 2025 14:43:14 +0000
Subject: [PATCH 19/21] Add a test case with extra arithmetic operations
---
.../AArch64/conditional-scalar-assignment.ll | 153 ++++++++++++++++++
1 file changed, 153 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index e777659f003c8..564ecf1ca2230 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -464,3 +464,156 @@ loop:
exit:
ret i32 %select.data
}
+
+;; Add more work to the loop besides the CSA to check cost modelling for NEON.
+define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %a) {
+; NEON-LABEL: define i32 @int_select_with_extra_arith_payload(
+; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NEON: [[VECTOR_PH]]:
+; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; NEON-NEXT: br label %[[VECTOR_BODY:.*]]
+; NEON: [[VECTOR_BODY]]:
+; NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; NEON-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; NEON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; NEON-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[WIDE_LOAD]], splat (i32 13)
+; NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; NEON-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_LOAD1]], splat (i32 5)
+; NEON-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; NEON-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; NEON-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; NEON-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; NEON-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; NEON-NEXT: [[TMP10]] = select i1 [[TMP9]], <4 x i1> [[TMP7]], <4 x i1> [[TMP0]]
+; NEON-NEXT: [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
+; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NEON-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NEON: [[MIDDLE_BLOCK]]:
+; NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1)
+; NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NEON: [[SCALAR_PH]]:
+; NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13
+; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; NEON-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5
+; NEON-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
+; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NEON-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]]
+; NEON-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
+; NEON-NEXT: ret i32 [[SELECT_A_LCSSA]]
+;
+; SVE-LABEL: define i32 @int_select_with_extra_arith_payload(
+; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 13)
+; SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; SVE-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i32> [[WIDE_LOAD1]], splat (i32 5)
+; SVE-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP6]], [[TMP8]]
+; SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; SVE-NEXT: store <vscale x 4 x i32> [[TMP9]], ptr [[TMP10]], align 4
+; SVE-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; SVE-NEXT: [[TMP12:%.*]] = freeze <vscale x 4 x i1> [[TMP11]]
+; SVE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP12]])
+; SVE-NEXT: [[TMP14]] = select i1 [[TMP13]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP15]] = select i1 [[TMP13]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x i1> [[TMP14]], i32 -1)
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13
+; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; SVE-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5
+; SVE-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
+; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; SVE-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]]
+; SVE-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SELECT_A_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %A.phi = phi i32 [ -1, %entry ], [ %select.A, %loop ]
+ %A.addr = getelementptr inbounds i32, ptr %A, i64 %iv
+ %ld.A = load i32, ptr %A.addr, align 4
+ %mul.A = mul i32 %ld.A, 13
+ %B.addr = getelementptr inbounds i32, ptr %B, i64 %iv
+ %ld.B = load i32, ptr %B.addr, align 4
+ %mul.B = mul i32 %ld.B, 5
+ %add = add i32 %mul.A, %mul.B
+ %C.addr = getelementptr inbounds i32, ptr %C, i64 %iv
+ store i32 %add, ptr %C.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld.A
+ %select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.A
+}
>From 142d881bbffd0cd9fdec63f38973b424e4e80afd Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 26 Nov 2025 11:50:44 +0000
Subject: [PATCH 20/21] Adjust tests post-rebase
---
.../AArch64/conditional-scalar-assignment.ll | 16 +++++-----
.../conditional-scalar-assignment-vplan.ll | 2 +-
.../LoopVectorize/iv-select-cmp-decreasing.ll | 32 +++++++++----------
.../iv-select-cmp-non-const-iv-start.ll | 8 ++---
.../LoopVectorize/iv-select-cmp-trunc.ll | 24 +++++++-------
5 files changed, 41 insertions(+), 41 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 564ecf1ca2230..73ba412c10249 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -466,16 +466,16 @@ exit:
}
;; Add more work to the loop besides the CSA to check cost modelling for NEON.
-define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %a) {
+define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %threshold) {
; NEON-LABEL: define i32 @int_select_with_extra_arith_payload(
-; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) {
+; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) {
; NEON-NEXT: [[ENTRY:.*]]:
; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; NEON: [[VECTOR_PH]]:
; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[THRESHOLD]], i64 0
; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; NEON-NEXT: br label %[[VECTOR_BODY:.*]]
; NEON: [[VECTOR_BODY]]:
@@ -519,7 +519,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
; NEON-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
; NEON-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
-; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]]
+; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]]
; NEON-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -529,7 +529,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
; NEON-NEXT: ret i32 [[SELECT_A_LCSSA]]
;
; SVE-LABEL: define i32 @int_select_with_extra_arith_payload(
-; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
+; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
; SVE-NEXT: [[ENTRY:.*]]:
; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
@@ -540,7 +540,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SVE: [[VECTOR_BODY]]:
@@ -584,7 +584,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea
; SVE-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
; SVE-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
-; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]]
+; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]]
; SVE-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -608,7 +608,7 @@ loop:
%add = add i32 %mul.A, %mul.B
%C.addr = getelementptr inbounds i32, ptr %C, i64 %iv
store i32 %add, ptr %C.addr, align 4
- %select.cmp = icmp slt i32 %a, %ld.A
+ %select.cmp = icmp slt i32 %threshold, %ld.A
%select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi
%iv.next = add nuw nsw i64 %iv, 1
%exit.cmp = icmp eq i64 %iv.next, %N
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index 64dc57f7ec492..788b35e88734e 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -49,7 +49,7 @@ exit:
; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir<false>, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, vector.body ]
; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<[[LDADDR:%.*]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]>
-; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer ir<[[LDADDR]]>
+; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer inbounds ir<[[LDADDR]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.*]]> = load vp<[[VPTR]]>
; CHECK-NEXT: WIDEN ir<[[SELECTCMP:%.*]]> = icmp slt ir<%a>, ir<[[LD]]>
; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[SELECTCMP]]>
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index cdd76957afcf1..890b6ccba0796 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -982,13 +982,13 @@ define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0
+; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3
; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
@@ -1038,13 +1038,13 @@ define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0
+; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
@@ -1145,13 +1145,13 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3
; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC1VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0
+; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3
; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
@@ -1214,13 +1214,13 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0
+; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 115956b977cee..88bb91efa0410 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -192,7 +192,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -208,7 +208,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -257,7 +257,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -273,7 +273,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start,
; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index ad5189c35cfd7..839ea7ce7e7a4 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -709,7 +709,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -763,7 +763,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -874,7 +874,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -929,7 +929,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1037,7 +1037,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1066,7 +1066,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1143,7 +1143,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1199,7 +1199,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1296,7 +1296,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1338,7 +1338,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
@@ -1433,7 +1433,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
@@ -1495,7 +1495,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
>From fce52d0f5755338b1de333f5386cfaaf1e657ced Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 26 Nov 2025 15:47:38 +0000
Subject: [PATCH 21/21] Unify Find recurrence detection.
---
llvm/include/llvm/Analysis/IVDescriptors.h | 9 ++-
llvm/lib/Analysis/IVDescriptors.cpp | 61 ++++++-------------
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 5 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 10 +--
5 files changed, 33 insertions(+), 54 deletions(-)
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index e2d99ef629af2..4fd9aaa9280f2 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -181,9 +181,8 @@ class RecurrenceDescriptor {
/// where one of (X, Y) is an increasing (FindLastIV) or decreasing
/// (FindFirstIV) loop induction variable, or an arbitrary integer value
/// (FindLast), and the other is a PHI value.
- LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE);
+ LLVM_ABI static InstDesc isFindPattern(Loop *TheLoop, PHINode *OrigPhi,
+ Instruction *I, ScalarEvolution &SE);
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
@@ -314,6 +313,10 @@ class RecurrenceDescriptor {
return Kind == RecurKind::FindLast;
}
+ static bool isFindRecurrenceKind(RecurKind Kind) {
+ return isFindLastRecurrenceKind(Kind) || isFindIVRecurrenceKind(Kind);
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 3eaafd41ada92..d2e86306de1ca 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -707,9 +707,8 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi,
// will either be the initial value (0) if the condition was never met, or the
// value of a[i] in the most recent loop iteration where the condition was met.
RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE) {
+RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi,
+ Instruction *I, ScalarEvolution &SE) {
// TODO: Support the vectorization of FindLastIV when the reduction phi is
// used by more than one select instruction. This vectorization is only
// performed when the SCEV of each increasing induction variable used by the
@@ -730,17 +729,6 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
m_Value(NonRdxPhi)))))
return InstDesc(false, I);
- if (isFindLastRecurrenceKind(Kind)) {
- // Must be an integer scalar.
- Type *Type = OrigPhi->getType();
- if (!Type->isIntegerTy())
- return InstDesc(false, I);
-
- // FIXME: Support more complex patterns, including multiple selects.
- // The Select must be used only outside the loop and by the PHI.
- return InstDesc(I, RecurKind::FindLast);
- }
-
// Returns either FindFirstIV/FindLastIV, if such a pattern is found, or
// std::nullopt.
auto GetRecurKind = [&](Value *V) -> std::optional<RecurKind> {
@@ -754,8 +742,9 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
m_SpecificLoop(TheLoop))))
return std::nullopt;
- if ((isFindFirstIVRecurrenceKind(Kind) && !SE.isKnownNegative(Step)) ||
- (isFindLastIVRecurrenceKind(Kind) && !SE.isKnownPositive(Step)))
+ // We must have a known positive or negative step for FindIV.
+ const bool PositiveStep = SE.isKnownPositive(Step);
+ if (!PositiveStep && !SE.isKnownNegative(Step))
return std::nullopt;
// Check if the minimum (FindLast) or maximum (FindFirst) value of the
@@ -771,7 +760,7 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
IsSigned ? SE.getSignedRange(AR) : SE.getUnsignedRange(AR);
unsigned NumBits = Ty->getIntegerBitWidth();
ConstantRange ValidRange = ConstantRange::getEmpty(NumBits);
- if (isFindLastIVRecurrenceKind(Kind)) {
+ if (PositiveStep) {
APInt Sentinel = IsSigned ? APInt::getSignedMinValue(NumBits)
: APInt::getMinValue(NumBits);
ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
@@ -785,26 +774,22 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
APInt::getMinValue(NumBits), APInt::getMaxValue(NumBits) - 1);
}
- LLVM_DEBUG(dbgs() << "LV: "
- << (isFindLastIVRecurrenceKind(Kind) ? "FindLastIV"
- : "FindFirstIV")
- << " valid range is " << ValidRange
- << ", and the range of " << *AR << " is " << IVRange
- << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LV: " << (PositiveStep ? "FindLastIV" : "FindFirstIV")
+ << " valid range is " << ValidRange << ", and the range of "
+ << *AR << " is " << IVRange << "\n");
// Ensure the induction variable does not wrap around by verifying that
// its range is fully contained within the valid range.
return ValidRange.contains(IVRange);
};
- if (isFindLastIVRecurrenceKind(Kind)) {
+ if (PositiveStep) {
if (CheckRange(true))
return RecurKind::FindLastIVSMax;
if (CheckRange(false))
return RecurKind::FindLastIVUMax;
return std::nullopt;
}
- assert(isFindFirstIVRecurrenceKind(Kind) &&
- "Kind must either be a FindLastIV or FindFirstIV");
if (CheckRange(true))
return RecurKind::FindFirstIVSMin;
@@ -816,7 +801,8 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
if (auto RK = GetRecurKind(NonRdxPhi))
return InstDesc(I, *RK);
- return InstDesc(false, I);
+ // If the recurrence is not specific to an IV, return a generic FindLast.
+ return InstDesc(I, RecurKind::FindLast);
}
RecurrenceDescriptor::InstDesc
@@ -950,8 +936,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
Kind == RecurKind::Add || Kind == RecurKind::Mul ||
Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs)
return isConditionalRdxPattern(I);
- if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE)
- return isFindPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindRecurrenceKind(Kind) && SE)
+ return isFindPattern(L, OrigPhi, I, *SE);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1091,14 +1077,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FindLastIVSMax, TheLoop, FMF, RedDes, DB,
- AC, DT, SE)) {
- LLVM_DEBUG(dbgs() << "Found a FindLastIV reduction PHI." << *Phi << "\n");
- return true;
- }
- if (AddReductionVar(Phi, RecurKind::FindFirstIVSMin, TheLoop, FMF, RedDes, DB,
- AC, DT, SE)) {
- LLVM_DEBUG(dbgs() << "Found a FindFirstIV reduction PHI." << *Phi << "\n");
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC,
+ DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a Find reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
@@ -1148,11 +1129,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC,
- DT, SE)) {
- LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
- return true;
- }
// Not a reduction of known type.
return false;
}
@@ -1278,7 +1254,6 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::FMinimumNum:
return Instruction::FCmp;
case RecurKind::FindLast:
- return Instruction::Select;
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 5b6c311a0dcff..16237a3f87279 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1259,9 +1259,8 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
// reductions.
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
- RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindLastRecurrenceKind(RK))
+ RecurrenceDescriptor::isFindRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
return std::nullopt;
if (RdxDesc.IntermediateStore)
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 8e2a4f80fce16..50c78c5d22d3c 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1491,7 +1491,7 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
RecurKind Kind, Value *Mask, Value *EVL) {
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
- !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
+ !RecurrenceDescriptor::isFindRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not supported.");
Intrinsic::ID Id = getReductionIntrinsicID(Kind);
auto VPID = VPIntrinsic::getForIntrinsic(Id);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4153140d4d02a..12f003a54e61c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4617,10 +4617,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
IsaPred<VPReductionPHIRecipe>);
// FIXME: implement interleaving for FindLast transform correctly.
- for (auto &[_, RdxDesc] : Legal->getReductionVars())
- if (RecurrenceDescriptor::isFindLastRecurrenceKind(
- RdxDesc.getRecurrenceKind()))
- return 1;
+ if (any_of(make_second_range(Legal->getReductionVars()),
+ [](const RecurrenceDescriptor &RdxDesc) {
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind());
+ }))
+ return 1;
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
More information about the llvm-commits
mailing list