[llvm] [LV] Vectorize FindLastIV/FindFirstIV reductions without sentinel. (WIP) (PR #172569)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 3 14:49:11 PST 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/172569
From ddd85e351178aacfddf90960f9ed072f21da69b4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 4 Dec 2025 21:01:23 +0000
Subject: [PATCH] [LV] Vectorize FindLastIV/FindFirstIV reductions without
sentinel. (WIP)
This patch generalizes FindLastIV/FindFirstIV handling to support cases
without a sentinel value by tracking whether the condition was ever true in
the loop.
To do so, an additional AnyOf reduction is added that records whether the
condition was true on any iteration. Its result is then used to select the
start value if the condition never became true in the loop.
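As a rough scalar model of the new lowering (illustrative only, not code from
this patch; the array/loop shape and names are assumptions):

  #include <algorithm>
  #include <climits>

  // Scalar model of the AnyOf-based FindLastIV form: the reduction starts at
  // the smax identity (which may collide with a legal IV value), and a
  // separate AnyOf (OR) reduction records whether the condition was ever true.
  int find_last_anyof(const int *A, int N, int Start) {
    int Rdx = INT_MIN;     // smax identity, not a sentinel
    bool AnyFound = false; // the additional AnyOf reduction
    for (int I = 0; I < N; ++I) {
      bool C = (A[I] == 3);
      AnyFound |= C;
      if (C)
        Rdx = std::max(Rdx, I); // FindLastIV reduces to smax over matching IVs
    }
    // Fall back to the start value when the condition never held.
    return AnyFound ? Rdx : Start;
  }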
We still use sentinel values when it is safe to do so; this is now done
through a new VPlan-based optimization.
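For comparison, a scalar model of the sentinel-based form the new VPlan
optimization restores when the IV range provably excludes the sentinel
(illustrative only; assumes a signed FindLastIV over a non-negative IV):

  #include <climits>

  // When the IV can never equal the sentinel, the start value is recovered
  // with a single compare and the extra AnyOf reduction is dropped.
  int find_last_sentinel(const int *A, int N, int Start) {
    int Rdx = INT_MIN; // sentinel; never produced by the IV
    for (int I = 0; I < N; ++I)
      if (A[I] == 3)
        Rdx = Rdx > I ? Rdx : I; // still an smax over matching IVs
    return Rdx != INT_MIN ? Rdx : Start;
  }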
Still marked as WIP, as there are a few changes that can be split off.
---
.../include/llvm/Transforms/Utils/LoopUtils.h | 6 -
llvm/lib/Analysis/IVDescriptors.cpp | 56 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 16 -
.../Transforms/Vectorize/LoopVectorize.cpp | 136 ++++-
.../Transforms/Vectorize/VPlanPatternMatch.h | 6 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 62 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 94 +++
.../Transforms/Vectorize/VPlanTransforms.h | 6 +
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 14 +-
.../AArch64/epilog-iv-select-cmp.ll | 38 ++
.../LoopVectorize/iv-select-cmp-decreasing.ll | 558 ++++++++++++++++--
.../LoopVectorize/iv-select-cmp-no-wrap.ll | 43 +-
.../iv-select-cmp-non-const-iv-start.ll | 272 ++++++++-
.../LoopVectorize/iv-select-cmp-trunc.ll | 558 ++++++++++++------
.../Transforms/LoopVectorize/iv-select-cmp.ll | 439 ++++++++++++--
.../LoopVectorize/select-smax-last-index.ll | 24 +-
.../LoopVectorize/select-umax-last-index.ll | 24 +-
.../vplan-printing-reductions.ll | 4 +-
18 files changed, 1918 insertions(+), 438 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 0afba21dfaf81..42ed3c659edd9 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -498,12 +498,6 @@ LLVM_ABI Value *createSimpleReduction(IRBuilderBase &B, Value *Src,
LLVM_ABI Value *createAnyOfReduction(IRBuilderBase &B, Value *Src,
Value *InitVal, PHINode *OrigPhi);
-/// Create a reduction of the given vector \p Src for a reduction of the
-/// kind RecurKind::FindLastIV.
-LLVM_ABI Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src,
- RecurKind RdxKind, Value *Start,
- Value *Sentinel);
-
/// Create an ordered reduction intrinsic using the given recurrence
/// kind \p RdxKind.
LLVM_ABI Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind,
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 7624e0ed6f2b0..7d380e204514f 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -784,57 +784,33 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
(isFindLastIVRecurrenceKind(Kind) && !SE.isKnownPositive(Step)))
return std::nullopt;
- // Check if the minimum (FindLast) or maximum (FindFirst) value of the
- // recurrence type can be used as a sentinel value. The maximum acceptable
- // range for the induction variable, called the valid range will exclude
- // <sentinel value>, where <sentinel value> is
- // [Signed|Unsigned]Min(<recurrence type>) for FindLastIV or
- // [Signed|Unsigned]Max(<recurrence type>) for FindFirstIV.
- // TODO: This range restriction can be lifted by adding an additional
- // virtual OR reduction.
- auto CheckRange = [&](bool IsSigned) {
+ // The IV must not wrap for the reduction to be correct (smax/smin requires
+ // a monotonic IV). Check whether the signed or unsigned variant is safe and
+ // return the appropriate recurrence kind.
+ auto *AddRec = cast<SCEVAddRecExpr>(AR);
+ auto IsSafeForVariant = [&](bool IsSigned) {
+ if (IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap())
+ return true;
+
+ // If the range doesn't wrap, the IV is monotonic.
const ConstantRange IVRange =
IsSigned ? SE.getSignedRange(AR) : SE.getUnsignedRange(AR);
- unsigned NumBits = Ty->getIntegerBitWidth();
- ConstantRange ValidRange = ConstantRange::getEmpty(NumBits);
- if (isFindLastIVRecurrenceKind(Kind)) {
- APInt Sentinel = IsSigned ? APInt::getSignedMinValue(NumBits)
- : APInt::getMinValue(NumBits);
- ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
- } else {
- if (IsSigned)
- ValidRange =
- ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits),
- APInt::getSignedMaxValue(NumBits) - 1);
- else
- ValidRange = ConstantRange::getNonEmpty(
- APInt::getMinValue(NumBits), APInt::getMaxValue(NumBits) - 1);
- }
-
- LLVM_DEBUG(dbgs() << "LV: "
- << (isFindLastIVRecurrenceKind(Kind) ? "FindLastIV"
- : "FindFirstIV")
- << " valid range is " << ValidRange
- << ", and the range of " << *AR << " is " << IVRange
- << "\n");
-
- // Ensure the induction variable does not wrap around by verifying that
- // its range is fully contained within the valid range.
- return ValidRange.contains(IVRange);
+ return !IVRange.isFullSet() &&
+ !(IsSigned ? IVRange.isSignWrappedSet() : IVRange.isWrappedSet());
};
+
if (isFindLastIVRecurrenceKind(Kind)) {
- if (CheckRange(true))
+ if (IsSafeForVariant(/*IsSigned=*/true))
return RecurKind::FindLastIVSMax;
- if (CheckRange(false))
+ if (IsSafeForVariant(/*IsSigned=*/false))
return RecurKind::FindLastIVUMax;
return std::nullopt;
}
assert(isFindFirstIVRecurrenceKind(Kind) &&
"Kind must either be a FindLastIV or FindFirstIV");
-
- if (CheckRange(true))
+ if (IsSafeForVariant(/*IsSigned=*/true))
return RecurKind::FindFirstIVSMin;
- if (CheckRange(false))
+ if (IsSafeForVariant(/*IsSigned=*/false))
return RecurKind::FindFirstIVUMin;
return std::nullopt;
};
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 8fb75d80a6aa1..749ff98ad0066 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1386,22 +1386,6 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src,
return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select");
}
-Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src,
- RecurKind RdxKind, Value *Start,
- Value *Sentinel) {
- bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RdxKind);
- bool IsMaxRdx = RecurrenceDescriptor::isFindLastIVRecurrenceKind(RdxKind);
- Value *MaxRdx = Src->getType()->isVectorTy()
- ? (IsMaxRdx ? Builder.CreateIntMaxReduce(Src, IsSigned)
- : Builder.CreateIntMinReduce(Src, IsSigned))
- : Src;
- // Correct the final reduction result back to the start value if the maximum
- // reduction is sentinel value.
- Value *Cmp =
- Builder.CreateCmp(CmpInst::ICMP_NE, MaxRdx, Sentinel, "rdx.select.cmp");
- return Builder.CreateSelect(Cmp, MaxRdx, Start, "rdx.select");
-}
-
Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty,
FastMathFlags Flags) {
bool Negative = false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 73d723e781483..c0553e9c8eba0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7326,17 +7326,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
"start value");
MainResumeValue = Cmp->getOperand(0);
} else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
- Value *StartV = getStartValueFromReductionResult(EpiRedResult);
- Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
using namespace llvm::PatternMatch;
Value *Cmp, *OrigResumeV, *CmpOp;
+
+ // The MainResumeValue should be: select(Cmp, Sentinel, OrigResumeV)
+ // where Cmp = icmp eq OrigResumeV, CmpOp, and CmpOp is either a frozen
+ // value or known not to be undef/poison.
[[maybe_unused]] bool IsExpectedPattern =
match(MainResumeValue,
- m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
- m_Value(OrigResumeV))) &&
- (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
- m_Value(CmpOp))) &&
- ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
+ m_Select(m_Value(Cmp), m_Value(), m_Value(OrigResumeV))) &&
+ match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
+ m_Value(CmpOp))) &&
+ (isa<FreezeInst>(CmpOp) || isGuaranteedNotToBeUndefOrPoison(CmpOp));
assert(IsExpectedPattern && "Unexpected reduction resume pattern");
MainResumeValue = OrigResumeV;
}
@@ -8542,6 +8542,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
+ // Optimize FindIV reductions to use a sentinel-based approach when possible.
+ VPlanTransforms::runPass(VPlanTransforms::optimizeFindIVReductions, *Plan,
+ PSE, *OrigLoop);
+
// Apply mandatory transformation to handle reductions with multiple in-loop
// uses if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMultiUseReductions,
@@ -8656,12 +8660,12 @@ void LoopVectorizationPlanner::addReductionResultComputation(
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
- for (VPRecipeBase &R :
- Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+
+ for (VPRecipeBase *R : to_vector(make_pointer_range(
+ VectorLoopRegion->getEntryBasicBlock()->phis()))) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(R);
if (!PhiR)
continue;
-
const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
cast<PHINode>(PhiR->getUnderlyingInstr()));
Type *PhiTy = TypeInfo.inferScalarType(PhiR);
@@ -8716,9 +8720,58 @@ void LoopVectorizationPlanner::addReductionResultComputation(
if (RecurrenceDescriptor::isFindIVRecurrenceKind(RecurrenceKind)) {
VPValue *Start = PhiR->getStartValue();
VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
- FinalReductionResult =
- Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
- {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
+
+ // For FindIV, we need to set up an additional AnyOf reduction to track if
+ // the condition was ever true in the loop.
+ auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
+ return match(U, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
+ }));
+ VPValue *Cmp = Select->getOperand(0);
+
+ auto *AnyFoundPhi =
+ new VPReductionPHIRecipe(nullptr, RecurKind::AnyOf, *Plan->getFalse(),
+ *Plan->getFalse(), RdxUnordered{1});
+ AnyFoundPhi->insertAfter(PhiR);
+
+ VPBuilder LoopBuilder(Select);
+ // Track when the IV is selected (condition was true for finding an
+ // element). If the true branch returns PhiR, the IV is selected when
+ // condition is false, so negate.
+ VPValue *AnyFoundCmp = Cmp;
+ if (Select->getOperand(1) == PhiR)
+ AnyFoundCmp = LoopBuilder.createNot(Cmp);
+ VPValue *AnyFoundOr = LoopBuilder.createOr(AnyFoundPhi, AnyFoundCmp);
+ AnyFoundPhi->setOperand(1, AnyFoundOr);
+
+ VPValue *AnyOfResult =
+ Builder.createNaryOp(VPInstruction::AnyOf, {AnyFoundOr});
+ auto *FrozenAnyOf = Builder.createNaryOp(
+ Instruction::Freeze, {AnyOfResult}, {}, ExitDL, "any.found.fr");
+
+ // Create ComputeFindIVResult to compute the final reduced IV value.
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ComputeFindIVResult,
+ {PhiR, Start, FrozenAnyOf, NewExitingVPV}, ExitDL);
+
+ VPBuilder PHBuilder(Plan->getVectorPreheader());
+ VPValue *ScaleFactorVPV = Plan->getConstantInt(32, 1);
+ VPValue *StartV =
+ PHBuilder.createNaryOp(VPInstruction::ReductionStartVector,
+ {Sentinel, Sentinel, ScaleFactorVPV},
+ FastMathFlags(), PhiR->getDebugLoc());
+ PhiR->setOperand(0, StartV);
+
+ // Update all users outside the vector region.
+ for (auto *U : to_vector(OrigExitingVPV->users())) {
+ auto *Parent = cast<VPRecipeBase>(U)->getParent();
+ // Skip the final result and recipes inside regions.
+ if (FinalReductionResult == U || Parent->getParent())
+ continue;
+ U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
+ if (match(U, m_CombineOr(m_ExtractLastLaneOfLastPart(m_VPValue()),
+ m_ExtractLane(m_VPValue(), m_VPValue()))))
+ cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
+ }
} else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
VPValue *Start = PhiR->getStartValue();
FinalReductionResult =
@@ -8814,13 +8867,6 @@ void LoopVectorizationPlanner::addReductionResultComputation(
continue;
}
- if (RecurrenceDescriptor::isFindIVRecurrenceKind(
- RdxDesc.getRecurrenceKind())) {
- // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
- // the sentinel value after generating the ResumePhi recipe, which uses
- // the original start value.
- PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
- }
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
@@ -9233,8 +9279,10 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
- EpiWidenedPhis.insert(
- cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
+ auto *UV = R.getVPSingleValue()->getUnderlyingValue();
+ if (!UV)
+ continue;
+ EpiWidenedPhis.insert(cast<PHINode>(UV));
}
for (VPRecipeBase &R :
make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
@@ -9377,6 +9425,14 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ // Skip AnyOf reductions with nullptr underlying value - these are
+ // synthesized for FindIV boolean tracking and will be handled when
+ // processing the associated FindIV reduction.
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ ReductionPhi->getRecurrenceKind()) &&
+ !ReductionPhi->getUnderlyingValue())
+ continue;
+
auto *RdxResult =
cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
auto *VPI = dyn_cast<VPInstruction>(U);
@@ -9403,6 +9459,14 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
EPI.MainLoopIterationCountCheck);
+ // Get sentinel from the ReductionPHI's start value
+ // (ReductionStartVector).
+ auto *StartVPI = dyn_cast<VPInstruction>(ReductionPhi->getStartValue());
+ assert(StartVPI &&
+ StartVPI->getOpcode() == VPInstruction::ReductionStartVector &&
+ "expected ReductionStartVector");
+ Value *Sentinel = StartVPI->getOperand(0)->getLiveInIRValue();
+
// VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
// an adjustment to the resume value. The resume value is adjusted to
// the sentinel value when the final value from the main vector loop
@@ -9414,10 +9478,34 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
if (auto *I = dyn_cast<Instruction>(Cmp))
InstsToMove.push_back(I);
- Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
if (auto *I = dyn_cast<Instruction>(ResumeV))
InstsToMove.push_back(I);
+
+ // Find the synthesized AnyOf PHI that follows this FindIV reduction.
+ // It was inserted immediately after the FindIV phi in
+ // addReductionResultComputation.
+ auto *AnyOfPhi =
+ dyn_cast<VPReductionPHIRecipe>(ReductionPhi->getNextNode());
+ if (AnyOfPhi &&
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(
+ AnyOfPhi->getRecurrenceKind()) &&
+ !AnyOfPhi->getUnderlyingValue()) {
+ // The resume value for AnyOf is whether the main loop found a match:
+ // ResumeV != FrozenStart. We already have Cmp = (ResumeV ==
+ // FrozenStart), so use NOT of that.
+ Value *AnyOfResume = Builder.CreateNot(Cmp);
+ if (auto *I = dyn_cast<Instruction>(AnyOfResume))
+ InstsToMove.push_back(I);
+ AnyOfPhi->setStartValue(Plan.getOrAddLiveIn(AnyOfResume));
+ }
+
+ // Replace the ReductionStartVector with the resume value directly.
+ // We can't just update the operand because ReductionStartVector creates
+ // insertelement(splat(sentinel), val, 0), but we need a full splat of
+ // the resume value for the epilog loop.
+ ReductionPhi->setStartValue(Plan.getOrAddLiveIn(ResumeV));
+ continue;
} else {
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 944b1cc9a8f93..2b73379bfdff3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -546,6 +546,12 @@ m_c_Add(const Op0_t &Op0, const Op1_t &Op1) {
return m_c_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Instruction::Or, Op0_t, Op1_t> m_Or(const Op0_t &Op0,
+ const Op1_t &Op1) {
+ return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
+}
+
template <typename Op0_t, typename Op1_t>
inline AllRecipe_match<Instruction::Sub, Op0_t, Op1_t> m_Sub(const Op0_t &Op0,
const Op1_t &Op1) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf6969d6f4f78..ded4da37473f5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,6 +164,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
+ case VPReductionPHISC:
case VPPredInstPHISC:
case VPVectorEndPointerSC:
return false;
@@ -469,7 +470,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ReductionStartVector:
return 3;
case VPInstruction::ComputeFindIVResult:
- return 4;
+ return -1u;
case Instruction::Call:
case Instruction::GetElementPtr:
case Instruction::PHI:
@@ -728,24 +729,47 @@ Value *VPInstruction::generate(VPTransformState &State) {
assert(!PhiR->isInLoop() &&
"In-loop FindLastIV reduction is not supported yet");
- // The recipe's operands are the reduction phi, the start value, the
- // sentinel value, followed by one operand for each part of the reduction.
- unsigned UF = getNumOperands() - 3;
- Value *ReducedPartRdx = State.get(getOperand(3));
- RecurKind MinMaxKind;
bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
- if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
- MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
- else
- MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
+ bool IsFindLast = RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
+ RecurKind MinMaxKind = IsFindLast
+ ? (IsSigned ? RecurKind::SMax : RecurKind::UMax)
+ : (IsSigned ? RecurKind::SMin : RecurKind::UMin);
+
+ // The recipe's operands are:
+ // [PhiR, Start, CondOrSentinel, IV0, IV1, ...]
+ // CondOrSentinel is either:
+ // - A boolean (i1) indicating if any element was found
+ // - The sentinel value, in which case we compute result != sentinel
+ Value *Start = State.get(getOperand(1), true);
+ Value *CondOrSentinel = State.get(getOperand(2), true);
+
+ // Combine all IV parts using min/max.
+ unsigned UF = getNumOperands() - 3;
+ Value *CombinedIV = State.get(getOperand(3));
for (unsigned Part = 1; Part < UF; ++Part)
- ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
- State.get(getOperand(3 + Part)));
+ CombinedIV = createMinMaxOp(Builder, MinMaxKind, CombinedIV,
+ State.get(getOperand(3 + Part)));
+
+ // Reduce vector to scalar.
+ Value *ReducedIV =
+ CombinedIV->getType()->isVectorTy()
+ ? (IsFindLast ? Builder.CreateIntMaxReduce(CombinedIV, IsSigned)
+ : Builder.CreateIntMinReduce(CombinedIV, IsSigned))
+ : CombinedIV;
+
+ // Determine the condition for the select.
+ Value *Cond;
+ if (CondOrSentinel->getType()->isIntegerTy(1)) {
+ // CondOrSentinel is a boolean - use it directly.
+ Cond = CondOrSentinel;
+ } else {
+ // CondOrSentinel is the sentinel value - compute result != sentinel.
+ Cond = Builder.CreateICmpNE(ReducedIV, CondOrSentinel, "rdx.select.cmp");
+ }
- Value *Start = State.get(getOperand(1), true);
- Value *Sentinel = getOperand(2)->getLiveInIRValue();
- return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
- Sentinel);
+ // Select between the reduced IV and the start value based on whether
+ // any element was found.
+ return Builder.CreateSelect(Cond, ReducedIV, Start);
}
case VPInstruction::ComputeReductionResult: {
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
@@ -1107,6 +1131,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Ctx.CostKind);
}
case VPInstruction::AnyOf: {
+ if (VF.isScalar())
+ return 0;
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
@@ -1336,8 +1362,10 @@ bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
// WidePtrAdd supports scalar and vector base addresses.
return false;
case VPInstruction::ComputeAnyOfResult:
- case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
+ case VPInstruction::ComputeFindIVResult:
+ // Operands 1 (Start) and 2 (FrozenAnyOf) use first lane only.
+ return Op == getOperand(1) || Op == getOperand(2);
case VPInstruction::ExtractLane:
return Op == getOperand(0);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8f0cf7d5beeed..b55e10a5788ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5319,3 +5319,97 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
+
+void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
+ PredicatedScalarEvolution &PSE,
+ Loop &L) {
+ VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+ ScalarEvolution &SE = *PSE.getSE();
+ VPTypeAnalysis TypeInfo(Plan);
+
+ bool MadeChanges = false;
+ for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
+ auto *FindIVResult = dyn_cast<VPInstruction>(&R);
+ if (!FindIVResult ||
+ FindIVResult->getOpcode() != VPInstruction::ComputeFindIVResult)
+ continue;
+
+ auto *PhiR = cast<VPReductionPHIRecipe>(FindIVResult->getOperand(0));
+ RecurKind RK = PhiR->getRecurrenceKind();
+ bool IsFindLast = RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
+ bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
+
+ // Get the IV from the backedge value of the reduction phi.
+ // The backedge value should be a select between the phi and the IV.
+ VPValue *BackedgeVal = PhiR->getBackedgeValue();
+ VPValue *TrueVal, *FalseVal;
+ if (!match(BackedgeVal,
+ m_Select(m_VPValue(), m_VPValue(TrueVal), m_VPValue(FalseVal))))
+ continue;
+
+ VPValue *AnyOfRdx;
+ [[maybe_unused]] bool Matched =
+ match(FindIVResult->getOperand(2),
+ m_Freeze(m_AnyOf(m_Or(m_VPValue(AnyOfRdx), m_VPValue()))));
+ assert(Matched && "expected Freeze(AnyOf(Or(...)))");
+
+ // The non-phi operand of the select is the IV.
+ VPValue *IV = (TrueVal == PhiR) ? FalseVal : TrueVal;
+
+ const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
+ if (isa<SCEVCouldNotCompute>(IVSCEV))
+ continue;
+
+ Type *IVTy = TypeInfo.inferScalarType(PhiR);
+ unsigned BW = IVTy->getIntegerBitWidth();
+ auto GetSentinel = [&](bool Signed) -> APInt {
+ return IsFindLast ? (Signed ? APInt::getSignedMinValue(BW)
+ : APInt::getMinValue(BW))
+ : (Signed ? APInt::getSignedMaxValue(BW)
+ : APInt::getMaxValue(BW));
+ };
+ auto CheckSentinel = [&](bool Signed) -> bool {
+ APInt Sentinel = GetSentinel(Signed);
+ ConstantRange IVRange =
+ Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
+ return !IVRange.contains(Sentinel);
+ };
+
+ // Try preferred signedness first, then try the opposite.
+ bool UseSigned = IsSigned;
+ if (!CheckSentinel(IsSigned)) {
+ if (!CheckSentinel(!IsSigned))
+ continue;
+ UseSigned = !IsSigned;
+ }
+ APInt SentinelVal = GetSentinel(UseSigned);
+ VPValue *Sentinel =
+ Plan.getOrAddLiveIn(ConstantInt::get(IVTy, SentinelVal));
+
+ // If we're using a different signedness than the original RecurKind,
+ // create a new VPReductionPHIRecipe with the correct RecurKind.
+ if (UseSigned != IsSigned) {
+ RecurKind NewRK;
+ if (IsFindLast)
+ NewRK =
+ UseSigned ? RecurKind::FindLastIVSMax : RecurKind::FindLastIVUMax;
+ else
+ NewRK =
+ UseSigned ? RecurKind::FindFirstIVSMin : RecurKind::FindFirstIVUMin;
+ auto *NewPhiR = new VPReductionPHIRecipe(
+ nullptr, NewRK, *Sentinel, *PhiR->getBackedgeValue(), RdxUnordered{1},
+ PhiR->hasUsesOutsideReductionChain());
+ NewPhiR->insertBefore(PhiR);
+ PhiR->replaceAllUsesWith(NewPhiR);
+ }
+ FindIVResult->setOperand(2, Sentinel);
+
+ auto *AnyOfRdxPhiR = cast<VPReductionPHIRecipe>(AnyOfRdx);
+ AnyOfRdxPhiR->replaceAllUsesWith(AnyOfRdxPhiR->getOperand(0));
+ AnyOfRdxPhiR->eraseFromParent();
+ MadeChanges = true;
+ }
+
+ if (MadeChanges)
+ VPlanTransforms::removeDeadRecipes(Plan);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f4ecee4e7f04f..a54e4f6e8e204 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -426,6 +426,12 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+ /// Optimize FindFirstIV/FindLastIV reductions by replacing boolean tracking
+ /// with a sentinel-based comparison when the IV range excludes the sentinel.
+ /// This removes the boolean phi and OR chain, simplifying the computation.
+ static void optimizeFindIVReductions(VPlan &Plan,
+ PredicatedScalarEvolution &PSE, Loop &L);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index a5798e1672016..56d7986f7c2e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -371,14 +371,22 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
- m_VPValue(), m_VPValue(Op1))) ||
- match(&R, m_VPInstruction<VPInstruction::ComputeFindIVResult>(
- m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
+ m_VPValue(), m_VPValue(Op1)))) {
addUniformForAllParts(cast<VPInstruction>(&R));
for (unsigned Part = 1; Part != UF; ++Part)
R.addOperand(getValueForPart(Op1, Part));
continue;
}
+ // ComputeFindIVResult has the format [PhiR, Start, CondOrSentinel, IV0, IV1,
+ // ...], with the IV operands starting at operand 3.
+ if (auto *VPI = dyn_cast<VPInstruction>(&R);
+ VPI && VPI->getOpcode() == VPInstruction::ComputeFindIVResult) {
+ addUniformForAllParts(VPI);
+ VPValue *IVOp = VPI->getOperand(3);
+ for (unsigned Part = 1; Part != UF; ++Part)
+ R.addOperand(getValueForPart(IVOp, Part));
+ continue;
+ }
VPValue *Op0;
if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
addUniformForAllParts(cast<VPInstruction>(&R));
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 580c568c373f1..66ca6fb18254e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -241,6 +241,44 @@ exit:
ret i32 %red.next
}
+define i8 @select_icmp_var_start_iv_may_wrap(ptr %a, i8 %n, i8 %start, i8 %iv.start) {
+; CHECK-LABEL: define i8 @select_icmp_var_start_iv_may_wrap(
+; CHECK-SAME: ptr [[A:%.*]], i8 [[N:%.*]], i8 [[START:%.*]], i8 [[IV_START:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[START]], %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3
+; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV]], i8 [[RDX]]
+; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ]
+; CHECK-NEXT: ret i8 [[SEL_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi i8 [ %start, %entry ], [ %sel, %loop ]
+ %gep = getelementptr inbounds i8, ptr %a, i8 %iv
+ %l = load i8, ptr %gep, align 8
+ %c = icmp eq i8 %l, 3
+ %sel = select i1 %c, i8 %iv, i8 %rdx
+ %iv.next = add i8 %iv, 1
+ %ec = icmp eq i8 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i8 %sel
+}
+
+
declare void @llvm.assume(i1 noundef)
attributes #0 = { "target-cpu"="apple-m1" }
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 75f0017c86c9a..916ff27400a78 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC1VF4 %s
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF4 %s
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF1 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=IC1VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=IC4VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=IC4VF1 %s
define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start(
@@ -960,30 +960,225 @@ exit: ; preds = %loop
ret i64 %spec.select
}
-; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX,
-; and since the IV hits this value, it is impossible to vectorize this case.
-; In this test, %iv's range will include both signed and unsigned
-; maximum (sentinel) values.
-define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; The IV range spans both signed and unsigned max values, but vectorization is
+; possible using the unsigned FindFirstIV (umin) approach because the sentinel
+; value (ULONG_MAX/-1) is not in the actual IV range used by the select.
+define i64 @select_decreasing_induction_find_first_iv_umin(ptr %a, ptr %b, i64 %rdx.start) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_find_first_iv_umin(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*:]]
+; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0
+; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
+; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
+; IC1VF4-NEXT: [[TMP9]] = select <4 x i1> [[TMP8]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC1VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP9]])
+; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP11]], -1
+; IC1VF4-NEXT: [[TMP12:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP11]], i64 [[RDX_START]]
+; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP12]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_find_first_iv_umin(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*:]]
+; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC4VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -4
+; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -3
+; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -8
+; IC4VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -3
+; IC4VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -12
+; IC4VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD8]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 0
+; IC4VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 -3
+; IC4VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
+; IC4VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 -3
+; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -8
+; IC4VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 -3
+; IC4VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -12
+; IC4VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 -3
+; IC4VF4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; IC4VF4-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
+; IC4VF4-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD16]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE11]]
+; IC4VF4-NEXT: [[TMP24:%.*]] = icmp sgt <4 x i8> [[REVERSE5]], [[REVERSE13]]
+; IC4VF4-NEXT: [[TMP25:%.*]] = icmp sgt <4 x i8> [[REVERSE7]], [[REVERSE15]]
+; IC4VF4-NEXT: [[TMP26:%.*]] = icmp sgt <4 x i8> [[REVERSE9]], [[REVERSE17]]
+; IC4VF4-NEXT: [[TMP27]] = select <4 x i1> [[TMP23]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[TMP28]] = select <4 x i1> [[TMP24]], <4 x i64> [[TMP2]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT: [[TMP29]] = select <4 x i1> [[TMP25]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT: [[TMP30]] = select <4 x i1> [[TMP26]], <4 x i64> [[TMP4]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], -16
+; IC4VF4-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP27]], <4 x i64> [[TMP28]])
+; IC4VF4-NEXT: [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP29]])
+; IC4VF4-NEXT: [[RDX_MINMAX19:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX18]], <4 x i64> [[TMP30]])
+; IC4VF4-NEXT: [[TMP32:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX19]])
+; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP32]], -1
+; IC4VF4-NEXT: [[TMP33:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP32]], i64 [[RDX_START]]
+; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 15, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP33]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_find_first_iv_umin(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*:]]
+; IC4VF1-NEXT: br label %[[VECTOR_PH:.*]]
+; IC4VF1: [[VECTOR_PH]]:
+; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF1: [[VECTOR_BODY]]:
+; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 -1, [[INDEX]]
+; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -2
+; IC4VF1-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -3
+; IC4VF1-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -1
+; IC4VF1-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], -1
+; IC4VF1-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], -1
+; IC4VF1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; IC4VF1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; IC4VF1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; IC4VF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; IC4VF1-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
+; IC4VF1-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
+; IC4VF1-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
+; IC4VF1-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
+; IC4VF1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; IC4VF1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; IC4VF1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; IC4VF1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; IC4VF1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP15]], align 1
+; IC4VF1-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP16]], align 1
+; IC4VF1-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP17]], align 1
+; IC4VF1-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP18]], align 1
+; IC4VF1-NEXT: [[TMP23:%.*]] = icmp sgt i8 [[TMP11]], [[TMP19]]
+; IC4VF1-NEXT: [[TMP24:%.*]] = icmp sgt i8 [[TMP12]], [[TMP20]]
+; IC4VF1-NEXT: [[TMP25:%.*]] = icmp sgt i8 [[TMP13]], [[TMP21]]
+; IC4VF1-NEXT: [[TMP26:%.*]] = icmp sgt i8 [[TMP14]], [[TMP22]]
+; IC4VF1-NEXT: [[TMP27]] = select i1 [[TMP23]], i64 [[TMP3]], i64 [[VEC_PHI]]
+; IC4VF1-NEXT: [[TMP28]] = select i1 [[TMP24]], i64 [[TMP4]], i64 [[VEC_PHI1]]
+; IC4VF1-NEXT: [[TMP29]] = select i1 [[TMP25]], i64 [[TMP5]], i64 [[VEC_PHI2]]
+; IC4VF1-NEXT: [[TMP30]] = select i1 [[TMP26]], i64 [[TMP6]], i64 [[VEC_PHI3]]
+; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC4VF1-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF1: [[MIDDLE_BLOCK]]:
+; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP27]], i64 [[TMP28]])
+; IC4VF1-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX]], i64 [[TMP29]])
+; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP30]])
+; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1
+; IC4VF1-NEXT: [[TMP32:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]]
+; IC4VF1-NEXT: br label %[[SCALAR_PH:.*]]
+; IC4VF1: [[SCALAR_PH]]:
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP32]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
@@ -1005,26 +1200,295 @@ exit:
ret i64 %cond
}
-define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; Vectorized using the AnyOf pattern: the loop tracks whether any condition was
+; true, allowing vectorization without requiring a sentinel value.
+define i64 @select_decreasing_induction_find_first_iv_any_of(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_find_first_iv_any_of(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*]]:
+; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[ANY_FOUND:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0
+; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3
+; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC1VF4-NEXT: [[TMP12]] = or <4 x i1> [[ANY_FOUND]], [[TMP11]]
+; IC1VF4-NEXT: [[TMP13]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP12]]
+; IC1VF4-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; IC1VF4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP16]]
+; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP13]])
+; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP15]], i64 [[RDX_START]]
+; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_find_first_iv_any_of(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*]]:
+; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
+; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP34:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP35:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP36:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ANY_FOUND:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ANY_FOUND6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ANY_FOUND7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ANY_FOUND8:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC4VF4-NEXT: [[TMP5:%.*]] = add nsw <4 x i64> [[STEP_ADD]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP6:%.*]] = add nsw <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0
+; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3
+; IC4VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -4
+; IC4VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i64 -3
+; IC4VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -8
+; IC4VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 -3
+; IC4VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -12
+; IC4VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP14]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP16]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD9]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD11]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD13]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 0
+; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i64 -3
+; IC4VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 -4
+; IC4VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i64 -3
+; IC4VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 -8
+; IC4VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 -3
+; IC4VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 -12
+; IC4VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 -3
+; IC4VF4-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i64>, ptr [[TMP19]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x i64>, ptr [[TMP25]], align 8
+; IC4VF4-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD15]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD17]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD19]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD21]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP26:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE16]]
+; IC4VF4-NEXT: [[TMP27:%.*]] = icmp sgt <4 x i64> [[REVERSE10]], [[REVERSE18]]
+; IC4VF4-NEXT: [[TMP28:%.*]] = icmp sgt <4 x i64> [[REVERSE12]], [[REVERSE20]]
+; IC4VF4-NEXT: [[TMP29:%.*]] = icmp sgt <4 x i64> [[REVERSE14]], [[REVERSE22]]
+; IC4VF4-NEXT: [[TMP30]] = or <4 x i1> [[ANY_FOUND]], [[TMP26]]
+; IC4VF4-NEXT: [[TMP31]] = or <4 x i1> [[ANY_FOUND6]], [[TMP27]]
+; IC4VF4-NEXT: [[TMP32]] = or <4 x i1> [[ANY_FOUND7]], [[TMP28]]
+; IC4VF4-NEXT: [[TMP33]] = or <4 x i1> [[ANY_FOUND8]], [[TMP29]]
+; IC4VF4-NEXT: [[TMP34]] = select <4 x i1> [[TMP26]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[TMP35]] = select <4 x i1> [[TMP27]], <4 x i64> [[TMP5]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT: [[TMP36]] = select <4 x i1> [[TMP28]], <4 x i64> [[TMP6]], <4 x i64> [[VEC_PHI4]]
+; IC4VF4-NEXT: [[TMP37]] = select <4 x i1> [[TMP29]], <4 x i64> [[TMP7]], <4 x i64> [[VEC_PHI5]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[TMP45:%.*]] = freeze <4 x i1> [[TMP30]]
+; IC4VF4-NEXT: [[TMP46:%.*]] = freeze <4 x i1> [[TMP31]]
+; IC4VF4-NEXT: [[TMP41:%.*]] = or <4 x i1> [[TMP45]], [[TMP46]]
+; IC4VF4-NEXT: [[TMP42:%.*]] = freeze <4 x i1> [[TMP32]]
+; IC4VF4-NEXT: [[TMP43:%.*]] = or <4 x i1> [[TMP41]], [[TMP42]]
+; IC4VF4-NEXT: [[TMP44:%.*]] = freeze <4 x i1> [[TMP33]]
+; IC4VF4-NEXT: [[ANY_FOUND_OR29:%.*]] = or <4 x i1> [[TMP43]], [[TMP44]]
+; IC4VF4-NEXT: [[TMP40:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[ANY_FOUND_OR29]])
+; IC4VF4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP40]]
+; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP34]], <4 x i64> [[TMP35]])
+; IC4VF4-NEXT: [[RDX_MINMAX22:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP36]])
+; IC4VF4-NEXT: [[RDX_MINMAX23:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX22]], <4 x i64> [[TMP37]])
+; IC4VF4-NEXT: [[TMP39:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX23]])
+; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP39]], i64 [[RDX_START]]
+; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_find_first_iv_any_of(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC4VF1-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC4VF1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC4VF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC4VF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF1: [[VECTOR_PH]]:
+; IC4VF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC4VF1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC4VF1-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF1: [[VECTOR_BODY]]:
+; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP34:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP35:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP36:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[ANY_FOUND:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[ANY_FOUND4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[ANY_FOUND5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[ANY_FOUND6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
+; IC4VF1-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], -2
+; IC4VF1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], -3
+; IC4VF1-NEXT: [[TMP6:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; IC4VF1-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP4]], -1
+; IC4VF1-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP5]], -1
+; IC4VF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; IC4VF1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; IC4VF1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; IC4VF1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; IC4VF1-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; IC4VF1-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; IC4VF1-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; IC4VF1-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; IC4VF1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP6]]
+; IC4VF1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP7]]
+; IC4VF1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP8]]
+; IC4VF1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]]
+; IC4VF1-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP18]], align 8
+; IC4VF1-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP19]], align 8
+; IC4VF1-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP20]], align 8
+; IC4VF1-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP21]], align 8
+; IC4VF1-NEXT: [[TMP26:%.*]] = icmp sgt i64 [[TMP14]], [[TMP22]]
+; IC4VF1-NEXT: [[TMP27:%.*]] = icmp sgt i64 [[TMP15]], [[TMP23]]
+; IC4VF1-NEXT: [[TMP28:%.*]] = icmp sgt i64 [[TMP16]], [[TMP24]]
+; IC4VF1-NEXT: [[TMP29:%.*]] = icmp sgt i64 [[TMP17]], [[TMP25]]
+; IC4VF1-NEXT: [[TMP30]] = or i1 [[ANY_FOUND]], [[TMP26]]
+; IC4VF1-NEXT: [[TMP31]] = or i1 [[ANY_FOUND4]], [[TMP27]]
+; IC4VF1-NEXT: [[TMP32]] = or i1 [[ANY_FOUND5]], [[TMP28]]
+; IC4VF1-NEXT: [[TMP33]] = or i1 [[ANY_FOUND6]], [[TMP29]]
+; IC4VF1-NEXT: [[TMP34]] = select i1 [[TMP26]], i64 [[TMP6]], i64 [[VEC_PHI]]
+; IC4VF1-NEXT: [[TMP35]] = select i1 [[TMP27]], i64 [[TMP7]], i64 [[VEC_PHI1]]
+; IC4VF1-NEXT: [[TMP36]] = select i1 [[TMP28]], i64 [[TMP8]], i64 [[VEC_PHI2]]
+; IC4VF1-NEXT: [[TMP37]] = select i1 [[TMP29]], i64 [[TMP9]], i64 [[VEC_PHI3]]
+; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4VF1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF1: [[MIDDLE_BLOCK]]:
+; IC4VF1-NEXT: [[TMP39:%.*]] = freeze i1 [[TMP30]]
+; IC4VF1-NEXT: [[TMP40:%.*]] = freeze i1 [[TMP31]]
+; IC4VF1-NEXT: [[TMP41:%.*]] = or i1 [[TMP39]], [[TMP40]]
+; IC4VF1-NEXT: [[TMP42:%.*]] = freeze i1 [[TMP32]]
+; IC4VF1-NEXT: [[TMP43:%.*]] = or i1 [[TMP41]], [[TMP42]]
+; IC4VF1-NEXT: [[TMP44:%.*]] = freeze i1 [[TMP33]]
+; IC4VF1-NEXT: [[ANY_FOUND_OR13:%.*]] = or i1 [[TMP43]], [[TMP44]]
+; IC4VF1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[ANY_FOUND_OR13]]
+; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP34]], i64 [[TMP35]])
+; IC4VF1-NEXT: [[RDX_MINMAX8:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX]], i64 [[TMP36]])
+; IC4VF1-NEXT: [[RDX_MINMAX12:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX8]], i64 [[TMP37]])
+; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[RDX_MINMAX12]], i64 [[RDX_START]]
+; IC4VF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC4VF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1: [[SCALAR_PH]]:
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index 21ef1885b75b9..f1d6967167ca4 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -145,10 +145,43 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-LABEL: define i64 @select_icmp_nuw(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ANY_FOUND:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP3]] = or <4 x i1> [[ANY_FOUND]], [[TMP2]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP6]], i64 [[II]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -157,9 +190,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -229,4 +262,6 @@ exit: ; preds = %for.body
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 72ed6537ef640..7463090fcb031 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -1,32 +1,250 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefixes=CHECK,VF4IC1
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefixes=CHECK,VF4IC4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefixes=CHECK,VF1IC4
define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %iv_start ,i64 %n) {
-; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
-; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
-; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
-; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]]
+; VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; VF4IC1-NEXT: [[ENTRY:.*]]:
+; VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; VF4IC1: [[FOR_BODY_PREHEADER]]:
+; VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4IC1: [[VECTOR_PH]]:
+; VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4IC1: [[VECTOR_BODY]]:
+; VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
+; VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4IC1: [[MIDDLE_BLOCK]]:
+; VF4IC1-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP4]]
+; VF4IC1-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; VF4IC1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP8]]
+; VF4IC1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
+; VF4IC1-NEXT: [[TMP10:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP9]], i64 [[RDX_START]]
+; VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; VF4IC1: [[SCALAR_PH]]:
+; VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
+; VF4IC1: [[FOR_BODY]]:
+; VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3
+; VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4IC1: [[EXIT_LOOPEXIT]]:
+; VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; VF4IC1-NEXT: br label %[[EXIT]]
+; VF4IC1: [[EXIT]]:
+; VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; VF4IC4-NEXT: [[ENTRY:.*]]:
+; VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; VF4IC4: [[FOR_BODY_PREHEADER]]:
+; VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4IC4: [[VECTOR_PH]]:
+; VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4IC4: [[VECTOR_BODY]]:
+; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 4
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 8
+; VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 12
+; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP5]], align 4
+; VF4IC4-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; VF4IC4-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], splat (i64 3)
+; VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], splat (i64 3)
+; VF4IC4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], splat (i64 3)
+; VF4IC4-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI4]], [[TMP6]]
+; VF4IC4-NEXT: [[TMP11]] = or <4 x i1> [[VEC_PHI5]], [[TMP7]]
+; VF4IC4-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI6]], [[TMP8]]
+; VF4IC4-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI7]], [[TMP9]]
+; VF4IC4-NEXT: [[TMP14]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4IC4: [[MIDDLE_BLOCK]]:
+; VF4IC4-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP10]]
+; VF4IC4-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP11]]
+; VF4IC4-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP12]]
+; VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = freeze <4 x i1> [[TMP13]]
+; VF4IC4-NEXT: [[TMP25:%.*]] = or <4 x i1> [[TMP23]], [[TMP24]]
+; VF4IC4-NEXT: [[TMP26:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP25]])
+; VF4IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP26]]
+; VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP14]], <4 x i64> [[TMP15]])
+; VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP16]])
+; VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX11]], <4 x i64> [[TMP17]])
+; VF4IC4-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX12]])
+; VF4IC4-NEXT: [[TMP28:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP27]], i64 [[RDX_START]]
+; VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; VF4IC4: [[SCALAR_PH]]:
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; VF4IC4: [[FOR_BODY]]:
+; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; VF4IC4-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP29]], 3
+; VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4IC4: [[EXIT_LOOPEXIT]]:
+; VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
+; VF4IC4-NEXT: br label %[[EXIT]]
+; VF4IC4: [[EXIT]]:
+; VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; VF1IC4-NEXT: [[ENTRY:.*]]:
+; VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; VF1IC4: [[FOR_BODY_PREHEADER]]:
+; VF1IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF1IC4: [[VECTOR_PH]]:
+; VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF1IC4: [[VECTOR_BODY]]:
+; VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; VF1IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF1IC4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 4
+; VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4
+; VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4
+; VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 4
+; VF1IC4-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP9]], 3
+; VF1IC4-NEXT: [[TMP14:%.*]] = icmp sgt i64 [[TMP10]], 3
+; VF1IC4-NEXT: [[TMP15:%.*]] = icmp sgt i64 [[TMP11]], 3
+; VF1IC4-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[TMP12]], 3
+; VF1IC4-NEXT: [[TMP17]] = or i1 [[VEC_PHI4]], [[TMP13]]
+; VF1IC4-NEXT: [[TMP18]] = or i1 [[VEC_PHI5]], [[TMP14]]
+; VF1IC4-NEXT: [[TMP19]] = or i1 [[VEC_PHI6]], [[TMP15]]
+; VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI7]], [[TMP16]]
+; VF1IC4-NEXT: [[TMP21]] = select i1 [[TMP13]], i64 [[OFFSET_IDX]], i64 [[VEC_PHI]]
+; VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI1]]
+; VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI2]]
+; VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP16]], i64 [[TMP4]], i64 [[VEC_PHI3]]
+; VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF1IC4-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF1IC4: [[MIDDLE_BLOCK]]:
+; VF1IC4-NEXT: [[TMP26:%.*]] = freeze i1 [[TMP17]]
+; VF1IC4-NEXT: [[TMP27:%.*]] = freeze i1 [[TMP18]]
+; VF1IC4-NEXT: [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]]
+; VF1IC4-NEXT: [[TMP29:%.*]] = freeze i1 [[TMP19]]
+; VF1IC4-NEXT: [[TMP30:%.*]] = or i1 [[TMP28]], [[TMP29]]
+; VF1IC4-NEXT: [[TMP31:%.*]] = freeze i1 [[TMP20]]
+; VF1IC4-NEXT: [[TMP32:%.*]] = or i1 [[TMP30]], [[TMP31]]
+; VF1IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP32]]
+; VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP21]], i64 [[TMP22]])
+; VF1IC4-NEXT: [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP23]])
+; VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP24]])
+; VF1IC4-NEXT: [[TMP33:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
+; VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; VF1IC4: [[SCALAR_PH]]:
+; VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP33]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; VF1IC4: [[FOR_BODY]]:
+; VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; VF1IC4-NEXT: [[TMP34:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP34]], 3
+; VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF1IC4: [[EXIT_LOOPEXIT]]:
+; VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP33]], %[[MIDDLE_BLOCK]] ]
+; VF1IC4-NEXT: br label %[[EXIT]]
+; VF1IC4: [[EXIT]]:
+; VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
;
entry:
%guard = icmp slt i64 %iv_start, %n
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index 6f4a582f3a842..6989186e23ed2 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -778,25 +778,53 @@ exit: ; preds = %for.body, %entry
ret i32 %rdx.lcssa
}
-; This test can theoretically be vectorized, but only with a runtime-check.
-; The construct that are introduced by IndVarSimplify is:
-; %1 = trunc i64 %iv to i32
-; However, the loop guard is unsigned:
-; %cmp.not = icmp eq i32 %n, 0
-; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the
-; sentinel value), and need a runtime-check to vectorize this case.
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) {
-; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+; Vectorized using the AnyOf pattern: an extra reduction tracks whether the condition
+; was ever true, allowing vectorization without requiring a sentinel value outside the
+; IV range. The loop guard is unsigned (%cmp.not = icmp eq i32 %n, 0), but the AnyOf
+; pattern handles this case without needing runtime checks.
+define i32 @select_icmp_const_truncated_iv_any_of(ptr %a, i32 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_any_of(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2]] = or <4 x i1> [[VEC_PHI1]], [[TMP9]]
+; CHECK-VF4IC1-NEXT: [[TMP3]] = select <4 x i1> [[TMP9]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP6]]
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[TMP7]], i32 331
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
@@ -804,25 +832,89 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_any_of(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP27]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD8]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD9]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD10]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI4]], [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI5]], [[TMP5]]
+; CHECK-VF4IC4-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI6]], [[TMP6]]
+; CHECK-VF4IC4-NEXT: [[TMP11]] = or <4 x i1> [[VEC_PHI7]], [[TMP7]]
+; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP5]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = select <4 x i1> [[TMP6]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 16
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP9]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP18]]
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]]
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP23]])
+; CHECK-VF4IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP24]]
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP14]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX11]], <4 x i32> [[TMP15]])
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_MINMAX12]])
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[TMP25]], i32 331
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
@@ -830,35 +922,99 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]]
;
-; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(
+; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_any_of(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0
; CHECK-VF1IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[SPEC_SELECT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[IV1]], 1
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV1]], 2
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV1]], 3
+; CHECK-VF1IC4-NEXT: [[TMP37:%.*]] = trunc i64 [[IV1]] to i32
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i32 [[TMP37]], 1
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i32 [[TMP37]], 2
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i32 [[TMP37]], 3
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP36]], 3
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP13]], 3
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp sgt i32 [[TMP14]], 3
+; CHECK-VF1IC4-NEXT: [[TMP19]] = or i1 [[VEC_PHI4]], [[CMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI5]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI6]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI7]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[SPEC_SELECT1]] = select i1 [[CMP2]], i32 [[TMP37]], i32 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP16]], i32 [[TMP4]], i32 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP25]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP26]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[TMP28:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[TMP29:%.*]] = freeze i1 [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP30:%.*]] = or i1 [[TMP28]], [[TMP29]]
+; CHECK-VF1IC4-NEXT: [[TMP31:%.*]] = freeze i1 [[TMP21]]
+; CHECK-VF1IC4-NEXT: [[TMP32:%.*]] = or i1 [[TMP30]], [[TMP31]]
+; CHECK-VF1IC4-NEXT: [[TMP33:%.*]] = freeze i1 [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP34:%.*]] = or i1 [[TMP32]], [[TMP33]]
+; CHECK-VF1IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP34]]
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[SPEC_SELECT1]], i32 [[TMP24]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX8:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX]], i32 [[TMP25]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX8]], i32 [[TMP26]])
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[RDX_MINMAX9]], i32 331
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
+; CHECK-VF1IC4-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 3
+; CHECK-VF1IC4-NEXT: [[TMP39:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP39]], i32 [[RDX]]
; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP35]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
; CHECK-VF1IC4: [[EXIT]]:
; CHECK-VF1IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -1064,18 +1220,43 @@ exit: ; preds = %for.body
ret i32 %cond
}
-; Without loop guard, when the constant trip count exceeds the maximum signed
-; value of the reduction type, truncation may cause overflow. Therefore,
-; vectorizer is unable to guarantee that the induction variable is monotonic
-; increasing.
-define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
-; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
+; The constant trip count exceeds the maximum signed value of the reduction
+; type, so the truncated IV will overflow. The loop is vectorized using the
+; AnyOf pattern, which tracks whether any condition was ever true, avoiding
+; the need for a sentinel value that stays outside the IV range.
+define i32 @select_fcmp_truncated_iv_any_of(ptr %a) {
+; CHECK-VF4IC1-LABEL: define i32 @select_fcmp_truncated_iv_any_of(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2]] = or <4 x i1> [[VEC_PHI1]], [[TMP9]]
+; CHECK-VF4IC1-NEXT: [[TMP3]] = select <4 x i1> [[TMP9]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP6]]
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[TMP7]], i32 -1
+; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP8]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
@@ -1083,18 +1264,76 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
;
-; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
+; CHECK-VF4IC4-LABEL: define i32 @select_fcmp_truncated_iv_any_of(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD8]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD9]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD10]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI4]], [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI5]], [[TMP5]]
+; CHECK-VF4IC4-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI6]], [[TMP6]]
+; CHECK-VF4IC4-NEXT: [[TMP11]] = or <4 x i1> [[VEC_PHI7]], [[TMP7]]
+; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP5]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = select <4 x i1> [[TMP6]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 16
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP9]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP18]]
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]]
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP23]])
+; CHECK-VF4IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP24]]
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP14]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX11]], <4 x i32> [[TMP15]])
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_MINMAX12]])
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[TMP25]], i32 -1
+; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP26]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
@@ -1102,26 +1341,84 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
;
-; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
+; CHECK-VF1IC4-LABEL: define i32 @select_fcmp_truncated_iv_any_of(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[SPEC_SELECT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[IV1]], 1
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV1]], 2
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV1]], 3
+; CHECK-VF1IC4-NEXT: [[TMP37:%.*]] = trunc i64 [[IV1]] to i32
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i32 [[TMP37]], 1
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i32 [[TMP37]], 2
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i32 [[TMP37]], 3
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP10]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP36]], 0.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp fast olt float [[TMP12]], 0.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp fast olt float [[TMP13]], 0.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp fast olt float [[TMP14]], 0.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP19]] = or i1 [[VEC_PHI4]], [[CMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI5]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI6]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI7]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[SPEC_SELECT1]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP16]], i32 [[TMP4]], i32 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP25]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP26]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF1IC4-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[TMP28:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[TMP29:%.*]] = freeze i1 [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP30:%.*]] = or i1 [[TMP28]], [[TMP29]]
+; CHECK-VF1IC4-NEXT: [[TMP31:%.*]] = freeze i1 [[TMP21]]
+; CHECK-VF1IC4-NEXT: [[TMP32:%.*]] = or i1 [[TMP30]], [[TMP31]]
+; CHECK-VF1IC4-NEXT: [[TMP33:%.*]] = freeze i1 [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP34:%.*]] = or i1 [[TMP32]], [[TMP33]]
+; CHECK-VF1IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP34]]
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[SPEC_SELECT1]], i32 [[TMP24]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX8:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX]], i32 [[TMP25]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX8]], i32 [[TMP26]])
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = select i1 [[ANY_FOUND_FR]], i32 [[RDX_MINMAX9]], i32 -1
+; CHECK-VF1IC4-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP35]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
-; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
+; CHECK-VF1IC4-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP38]], 0.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP39:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP39]], i32 [[RDX]]
; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
@@ -1286,187 +1583,58 @@ define i1 @select_with_trunc_i1_iv(i64 %n, i64 %start) {
; CHECK-VF4IC1-LABEL: define i1 @select_with_trunc_i1_iv(
; CHECK-VF4IC1-SAME: i64 [[N:%.*]], i64 [[START:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK-VF4IC1: [[VECTOR_PH]]:
-; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[START]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[START]], i64 0
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-VF4IC1: [[VECTOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_IND1:%.*]] = phi <4 x i1> [ <i1 false, i1 true, i1 false, i1 true>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-VF4IC1-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i1> [[VEC_PHI]], <4 x i1> [[VEC_IND1]]
-; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT2]] = add <4 x i1> [[VEC_IND1]], zeroinitializer
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.smin.v4i1(<4 x i1> [[TMP3]])
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i1 [[TMP5]], false
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i1 [[TMP5]], i1 false
-; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK-VF4IC1: [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ false, %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
; CHECK-VF4IC1: [[LOOP]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF4IC1-NEXT: [[CTR:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF4IC1-NEXT: [[ACCUM:%.*]] = phi i1 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[CTR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[ACCUM:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-VF4IC1-NEXT: [[TRUNC:%.*]] = trunc i64 [[CTR]] to i1
; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP]], i1 [[ACCUM]], i1 [[TRUNC]]
; CHECK-VF4IC1-NEXT: [[CTR_NEXT]] = add i64 [[CTR]], 1
; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[N]], [[CTR]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ]
; CHECK-VF4IC1-NEXT: ret i1 [[SEL_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i1 @select_with_trunc_i1_iv(
; CHECK-VF4IC4-SAME: i64 [[N:%.*]], i64 [[START:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK-VF4IC4: [[VECTOR_PH]]:
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[START]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[START]], i64 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-VF4IC4: [[VECTOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_IND4:%.*]] = phi <4 x i1> [ <i1 false, i1 true, i1 false, i1 true>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
-; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
-; CHECK-VF4IC4-NEXT: [[STEP_ADD5:%.*]] = add <4 x i1> [[VEC_IND4]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[STEP_ADD_26:%.*]] = add <4 x i1> [[STEP_ADD5]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[STEP_ADD_37:%.*]] = add <4 x i1> [[STEP_ADD_26]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[STEP_ADD]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[STEP_ADD_2]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[STEP_ADD_3]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP6]] = select <4 x i1> [[TMP2]], <4 x i1> [[VEC_PHI]], <4 x i1> [[VEC_IND4]]
-; CHECK-VF4IC4-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i1> [[VEC_PHI1]], <4 x i1> [[STEP_ADD5]]
-; CHECK-VF4IC4-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i1> [[VEC_PHI2]], <4 x i1> [[STEP_ADD_26]]
-; CHECK-VF4IC4-NEXT: [[TMP9]] = select <4 x i1> [[TMP5]], <4 x i1> [[VEC_PHI3]], <4 x i1> [[STEP_ADD_37]]
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
-; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT8]] = add <4 x i1> [[STEP_ADD_37]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i1> @llvm.smin.v4i1(<4 x i1> [[TMP6]], <4 x i1> [[TMP7]])
-; CHECK-VF4IC4-NEXT: [[RDX_MINMAX9:%.*]] = call <4 x i1> @llvm.smin.v4i1(<4 x i1> [[RDX_MINMAX]], <4 x i1> [[TMP8]])
-; CHECK-VF4IC4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i1> @llvm.smin.v4i1(<4 x i1> [[RDX_MINMAX9]], <4 x i1> [[TMP9]])
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.smin.v4i1(<4 x i1> [[RDX_MINMAX10]])
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i1 [[TMP11]], false
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i1 [[TMP11]], i1 false
-; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK-VF4IC4: [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ false, %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
; CHECK-VF4IC4: [[LOOP]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF4IC4-NEXT: [[CTR:%.*]] = phi i64 [ [[BC_RESUME_VAL11]], %[[SCALAR_PH]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF4IC4-NEXT: [[ACCUM:%.*]] = phi i1 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[CTR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[ACCUM:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-VF4IC4-NEXT: [[TRUNC:%.*]] = trunc i64 [[CTR]] to i1
; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP]], i1 [[ACCUM]], i1 [[TRUNC]]
; CHECK-VF4IC4-NEXT: [[CTR_NEXT]] = add i64 [[CTR]], 1
; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[N]], [[CTR]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ]
; CHECK-VF4IC4-NEXT: ret i1 [[SEL_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i1 @select_with_trunc_i1_iv(
; CHECK-VF1IC4-SAME: i64 [[N:%.*]], i64 [[START:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK-VF1IC4: [[VECTOR_PH]]:
-; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[START]], [[N_VEC]]
-; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-VF1IC4: [[VECTOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START]], [[INDEX]]
-; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 3
-; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i1
-; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i1 [[TMP5]], true
-; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i1 [[TMP5]], false
-; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = add i1 [[TMP5]], true
-; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[OFFSET_IDX]], 0
-; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0
-; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP3]], 0
-; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP4]], 0
-; CHECK-VF1IC4-NEXT: [[TMP13]] = select i1 [[TMP9]], i1 [[VEC_PHI]], i1 [[TMP5]]
-; CHECK-VF1IC4-NEXT: [[TMP14]] = select i1 [[TMP10]], i1 [[VEC_PHI1]], i1 [[TMP6]]
-; CHECK-VF1IC4-NEXT: [[TMP15]] = select i1 [[TMP11]], i1 [[VEC_PHI2]], i1 [[TMP7]]
-; CHECK-VF1IC4-NEXT: [[TMP16]] = select i1 [[TMP12]], i1 [[VEC_PHI3]], i1 [[TMP8]]
-; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF1IC4-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
-; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i1 @llvm.smin.i1(i1 [[TMP13]], i1 [[TMP14]])
-; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i1 @llvm.smin.i1(i1 [[RDX_MINMAX]], i1 [[TMP15]])
-; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i1 @llvm.smin.i1(i1 [[RDX_MINMAX4]], i1 [[TMP16]])
-; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i1 [[RDX_MINMAX5]], false
-; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i1 [[RDX_MINMAX5]], i1 false
-; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK-VF1IC4: [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ false, %[[ENTRY]] ]
; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
; CHECK-VF1IC4: [[LOOP]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF1IC4-NEXT: [[CTR:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-VF1IC4-NEXT: [[ACCUM:%.*]] = phi i1 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[CTR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[CTR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[ACCUM:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-VF1IC4-NEXT: [[TRUNC:%.*]] = trunc i64 [[CTR]] to i1
; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP]], i1 [[ACCUM]], i1 [[TRUNC]]
; CHECK-VF1IC4-NEXT: [[CTR_NEXT]] = add i64 [[CTR]], 1
; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[N]], [[CTR]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], %[[LOOP]] ]
; CHECK-VF1IC4-NEXT: ret i1 [[SEL_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 7f947251ed3e4..bd8f43e1c9b60 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1953,11 +1953,46 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-VF4IC1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP9]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1967,19 +2002,93 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP33]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP33]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP33]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP33]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD11]]
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI4]], [[TMP9]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI5]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = freeze <4 x i1> [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP22]], [[TMP23]]
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = or <4 x i1> [[TMP24]], [[TMP25]]
+; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = freeze <4 x i1> [[TMP16]]
+; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP26]], [[TMP27]]
+; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP28]])
+; CHECK-VF4IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP29]]
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP17]], <4 x i64> [[TMP18]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
+; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP30]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP32]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL17]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1989,31 +2098,105 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP31]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = add i64 -9223372036854775808, [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = add i64 [[IV_J]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV_J]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[IV_J]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[IV_I]], 1
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i64 [[IV_I]], 2
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i64 [[IV_I]], 3
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP6]]
; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP17]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP18]], align 8
; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp sgt i64 [[TMP13]], [[TMP21]]
+; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = icmp sgt i64 [[TMP14]], [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP27]] = or i1 [[VEC_PHI4]], [[CMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP28]] = or i1 [[VEC_PHI5]], [[TMP24]]
+; CHECK-VF1IC4-NEXT: [[TMP29]] = or i1 [[VEC_PHI6]], [[TMP25]]
+; CHECK-VF1IC4-NEXT: [[TMP30]] = or i1 [[VEC_PHI7]], [[TMP26]]
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]]
-; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
-; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
+; CHECK-VF1IC4-NEXT: [[TMP32]] = select i1 [[TMP24]], i64 [[TMP11]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP33]] = select i1 [[TMP25]], i64 [[TMP2]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP34]] = select i1 [[TMP26]], i64 [[TMP3]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV_I]], 4
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[TMP36:%.*]] = freeze i1 [[TMP27]]
+; CHECK-VF1IC4-NEXT: [[TMP37:%.*]] = freeze i1 [[TMP28]]
+; CHECK-VF1IC4-NEXT: [[TMP38:%.*]] = or i1 [[TMP36]], [[TMP37]]
+; CHECK-VF1IC4-NEXT: [[TMP39:%.*]] = freeze i1 [[TMP29]]
+; CHECK-VF1IC4-NEXT: [[TMP40:%.*]] = or i1 [[TMP38]], [[TMP39]]
+; CHECK-VF1IC4-NEXT: [[TMP41:%.*]] = freeze i1 [[TMP30]]
+; CHECK-VF1IC4-NEXT: [[TMP42:%.*]] = or i1 [[TMP40]], [[TMP41]]
+; CHECK-VF1IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP42]]
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP34]])
+; CHECK-VF1IC4-NEXT: [[TMP43:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP43]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV_J1:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[IV_I1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL10]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT: [[TMP44:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT: [[TMP45:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP44]], [[TMP45]]
+; CHECK-VF1IC4-NEXT: [[COND1]] = select i1 [[CMP3]], i64 [[IV_J1]], i64 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I1]], 1
+; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J1]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[FOR_BODY]] ], [ [[TMP43]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -2042,10 +2225,49 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP5]]
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; CHECK-VF4IC1-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP9]]
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]])
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP10]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2054,18 +2276,96 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP33]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP33]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP33]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP34:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD11]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI4]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI5]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = freeze <4 x i1> [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = or <4 x i1> [[TMP23]], [[TMP24]]
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = freeze <4 x i1> [[TMP16]]
+; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP25]], [[TMP26]]
+; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = freeze <4 x i1> [[TMP17]]
+; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP28]]
+; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP29]])
+; CHECK-VF4IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP30]]
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP18]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP21]])
+; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
+; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[TMP31]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP33]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP34]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP32]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2074,29 +2374,100 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP32]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[IV]], 2
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[IV]], 3
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = icmp sgt i64 [[TMP10]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = icmp sgt i64 [[TMP11]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP25]] = or i1 [[VEC_PHI4]], [[CMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP26]] = or i1 [[VEC_PHI5]], [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP27]] = or i1 [[VEC_PHI6]], [[TMP23]]
+; CHECK-VF1IC4-NEXT: [[TMP28]] = or i1 [[VEC_PHI7]], [[TMP24]]
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
-; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP30]] = select i1 [[TMP22]], i64 [[TMP2]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP31]] = select i1 [[TMP23]], i64 [[TMP3]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP32]] = select i1 [[TMP24]], i64 [[TMP4]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[TMP34:%.*]] = freeze i1 [[TMP25]]
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = freeze i1 [[TMP26]]
+; CHECK-VF1IC4-NEXT: [[TMP36:%.*]] = or i1 [[TMP34]], [[TMP35]]
+; CHECK-VF1IC4-NEXT: [[TMP37:%.*]] = freeze i1 [[TMP27]]
+; CHECK-VF1IC4-NEXT: [[TMP38:%.*]] = or i1 [[TMP36]], [[TMP37]]
+; CHECK-VF1IC4-NEXT: [[TMP39:%.*]] = freeze i1 [[TMP28]]
+; CHECK-VF1IC4-NEXT: [[TMP40:%.*]] = or i1 [[TMP38]], [[TMP39]]
+; CHECK-VF1IC4-NEXT: [[ANY_FOUND_FR:%.*]] = freeze i1 [[TMP40]]
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP30]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT: [[TMP41:%.*]] = select i1 [[ANY_FOUND_FR]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP41]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP42:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP43:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP42]], [[TMP43]]
+; CHECK-VF1IC4-NEXT: [[COND1]] = select i1 [[CMP3]], i64 [[IV1]], i64 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[FOR_BODY]] ], [ [[TMP41]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
index 2117704950b84..4cef774f2a736 100644
--- a/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
@@ -537,18 +537,18 @@ define i64 @test_no_vectorize_select_iv_decrement(ptr %src) {
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
-; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.smax.i64(i64 [[MIN_VAL]], i64 [[L]])
-; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MAX_VAL]], [[L]]
+; CHECK-NEXT: [[MAX_VAL_NEXT]] = tail call i64 @llvm.smax.i64(i64 [[MAX_VAL]], i64 [[L]])
+; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -579,18 +579,18 @@ define i64 @test_no_vectorize_select_iv_sub(ptr %src) {
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
-; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.smax.i64(i64 [[MIN_VAL]], i64 [[L]])
-; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MAX_VAL]], [[L]]
+; CHECK-NEXT: [[MAX_VAL_NEXT]] = tail call i64 @llvm.smax.i64(i64 [[MAX_VAL]], i64 [[L]])
+; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
index 1bb3d017a67cd..0fcceab1bf52c 100644
--- a/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
@@ -537,18 +537,18 @@ define i64 @test_no_vectorize_select_iv_decrement(ptr %src) {
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
-; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umax.i64(i64 [[MIN_VAL]], i64 [[L]])
-; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MAX_VAL]], [[L]]
+; CHECK-NEXT: [[MAX_VAL_NEXT]] = tail call i64 @llvm.umax.i64(i64 [[MAX_VAL]], i64 [[L]])
+; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], -1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -579,18 +579,18 @@ define i64 @test_no_vectorize_select_iv_sub(ptr %src) {
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1000, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
-; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i64 @llvm.umax.i64(i64 [[MIN_VAL]], i64 [[L]])
-; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MAX_VAL]], [[L]]
+; CHECK-NEXT: [[MAX_VAL_NEXT]] = tail call i64 @llvm.umax.i64(i64 [[MAX_VAL]], i64 [[L]])
+; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 8e160e57b7703..f781496ff89eb 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -239,12 +239,14 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = compute-find-iv-result ir<%rdx>, ir<%start>, ir<-9223372036854775808>, ir<%cond>
+; CHECK-NEXT: EMIT vp<[[EXT_PART:%.+]]> = extract-last-part vp<[[RDX_RES]]>
+; CHECK-NEXT: EMIT vp<[[EXT_LANE:%.+]]> = extract-last-lane vp<[[EXT_PART]]>
; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}>
; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK: ir-bb<exit>:
-; CHECK-NEXT: IR %cond.lcssa = phi i64 [ %cond, %loop ] (extra operand: vp<[[RDX_RES]]> from middle.block)
+; CHECK-NEXT: IR %cond.lcssa = phi i64 [ %cond, %loop ] (extra operand: vp<[[EXT_LANE]]> from middle.block)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
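
(Editorial sketch, not part of the patch; function and parameter names below are purely illustrative. It shows, in C, the scalar find-last-IV pattern the tests above exercise, plus the equivalent of what the new middle block computes as visible in the diffs: an "any lane matched" flag combined with or, and a final select between the max-reduced candidate index and the reduction's start value, so no sentinel value is required.)

#include <stddef.h>
#include <stdint.h>

/* Scalar find-last-IV pattern, as in the tests above: the result is the last
   index i where a[i] > b[i], or rdx_start if the condition is never true. */
int64_t find_last_gt(const int64_t *a, const int64_t *b, int64_t rdx_start,
                     size_t n) {
  int64_t rdx = rdx_start;
  for (size_t i = 0; i < n; ++i)
    if (a[i] > b[i])
      rdx = (int64_t)i;
  return rdx;
}

/* Rough equivalent of the middle-block sequence in the diffs above (the
   or/freeze/smax/select chain): per-part candidate indices are max-reduced,
   the "condition was ever true" flags are or-reduced, and the start value is
   chosen when no lane ever matched. */
int64_t combine_parts(const int64_t *part_max, const _Bool *part_found,
                      size_t parts, int64_t rdx_start) {
  int64_t best = INT64_MIN; /* mirrors the splat(i64 INT64_MIN) phi start */
  _Bool any = 0;
  for (size_t p = 0; p < parts; ++p) {
    any |= part_found[p];
    if (part_max[p] > best)
      best = part_max[p];
  }
  return any ? best : rdx_start;
}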