[llvm] [LV] Vectorize conditional scalar assignments (PR #158088)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 09:30:22 PDT 2025
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/158088
>From abea562a6ebe4f0e53c2cb8b4ea2cd14a6961b3f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 26 Mar 2025 11:35:01 +0000
Subject: [PATCH] [LV] Vectorize conditional scalar assignments
Based on Michael Maitland's previous work:
https://github.com/llvm/llvm-project/pull/121222
This PR uses the existing recurrences code instead of introducing a
new pass just for conditional scalar assignment (CSA)
auto-vectorization. I've also made the recipes more generic.
I've enabled it by default to see the impact on tests; if there are
regressions, we can put it behind a command-line option.
---
llvm/include/llvm/Analysis/IVDescriptors.h | 23 +-
llvm/lib/Analysis/IVDescriptors.cpp | 45 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 1 +
.../Transforms/Vectorize/LoopVectorize.cpp | 29 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 3 +
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +
.../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 83 +++
.../Transforms/Vectorize/VPlanTransforms.h | 8 +
.../AArch64/conditional-scalar-assignment.ll | 95 +++
.../conditional-scalar-assignment-vplan.ll | 125 ++++
.../LoopVectorize/iv-select-cmp-decreasing.ll | 336 +++++++++--
.../LoopVectorize/iv-select-cmp-no-wrap.ll | 92 ++-
.../iv-select-cmp-non-const-iv-start.ll | 381 ++++++++++--
.../LoopVectorize/iv-select-cmp-trunc.ll | 566 ++++++++++++++----
.../Transforms/LoopVectorize/iv-select-cmp.ll | 198 +++++-
.../Transforms/LoopVectorize/select-cmp.ll | 145 ++++-
18 files changed, 1895 insertions(+), 266 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 654a5f10cea96..370c301731337 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< are of integer type, one is the current recurrence value,
+ ///< and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -175,13 +178,12 @@ class RecurrenceDescriptor {
/// Returns a struct describing whether the instruction is either a
/// Select(ICmp(A, B), X, Y), or
/// Select(FCmp(A, B), X, Y)
- /// where one of (X, Y) is an increasing (FindLast) or decreasing (FindFirst)
- /// loop induction variable, and the other is a PHI value.
- // TODO: Support non-monotonic variable. FindLast does not need be restricted
- // to increasing loop induction variables.
- LLVM_ABI static InstDesc isFindIVPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE);
+ /// where one of (X, Y) is an increasing (FindLastIV) or decreasing
+ /// (FindFirstIV) loop induction variable, or an arbitrary integer value
+ /// (FindLast), and the other is a PHI value.
+ LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop,
+ PHINode *OrigPhi, Instruction *I,
+ ScalarEvolution &SE);
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
@@ -305,6 +307,13 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value and the
+ /// other is a recurrence.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 9f8ac6e8e2e0b..4a714682b5d5e 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -691,9 +693,9 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi,
// value of the data type or a non-constant value by using mask and multiple
// reduction operations.
RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
- PHINode *OrigPhi, Instruction *I,
- ScalarEvolution &SE) {
+RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop,
+ PHINode *OrigPhi, Instruction *I,
+ ScalarEvolution &SE) {
// TODO: Support the vectorization of FindLastIV when the reduction phi is
// used by more than one select instruction. This vectorization is only
// performed when the SCEV of each increasing induction variable used by the
@@ -702,8 +704,10 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
// We are looking for selects of the form:
- // select(cmp(), phi, loop_induction) or
- // select(cmp(), loop_induction, phi)
+ // select(cmp(), phi, value) or
+ // select(cmp(), value, phi)
+ // where 'value' might be a loop induction variable
+ // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast).
// TODO: Match selects with multi-use cmp conditions.
Value *NonRdxPhi = nullptr;
if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi),
@@ -712,6 +716,25 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
m_Value(NonRdxPhi)))))
return InstDesc(false, I);
+ if (isFindLastRecurrenceKind(Kind)) {
+ // Must be an integer or pointer scalar.
+ Type *Type = OrigPhi->getType();
+ if (!Type->isIntegerTy() && !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // The Select must be used only outside the loop and by the PHI.
+ for (User *U : I->users()) {
+ if (U == OrigPhi)
+ continue;
+ if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI))
+ continue;
+ return InstDesc(false, I);
+ }
+
+ return InstDesc(I, RecurKind::FindLast);
+ }
+
// Returns either FindFirstIV/FindLastIV, if such a pattern is found, or
// std::nullopt.
auto GetRecurKind = [&](Value *V) -> std::optional<RecurKind> {
@@ -920,8 +943,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
Kind == RecurKind::Add || Kind == RecurKind::Mul ||
Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs)
return isConditionalRdxPattern(I);
- if (isFindIVRecurrenceKind(Kind) && SE)
- return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE)
+ return isFindPattern(Kind, L, OrigPhi, I, *SE);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1118,7 +1141,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC,
+ DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1248,6 +1275,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
return Instruction::FCmp;
+ case RecurKind::FindLast:
+ return Instruction::Select;
default:
llvm_unreachable("Unknown recurrence operation");
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 47c1ac4b22224..065bb17317bb8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5373,6 +5373,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 25bf49db0e073..58313634f56a0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4559,6 +4559,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8475,6 +8481,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan,
+ RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8569,6 +8579,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8872,7 +8883,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -9996,6 +10008,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for FindLast reductions.
+ if (any_of(make_second_range(LVL.getReductionVars()), [&](auto &RdxDesc) {
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b55a3b235228..b588aec2c4236 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25123,6 +25123,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25264,6 +25265,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25370,6 +25372,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f10058ab4a9a..7cf026e7424f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1067,6 +1067,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// Returns the value for vscale.
VScale,
OpsEnd = VScale,
+ /// Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 80a2e4bc3f754..eada2c65bc820 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -118,7 +118,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bde62dd6dd4bc..5a02d70191e93 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -546,6 +546,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -999,6 +1000,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1135,6 +1147,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(
+ Intrinsic::experimental_vector_extract_last_active, ScalarTy,
+ {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1191,6 +1212,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1252,6 +1274,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1437,6 +1460,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Unpack:
O << "unpack";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 986c801abf684..e06b92cc3fa9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -36,6 +36,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;
using namespace VPlanPatternMatch;
@@ -4573,3 +4574,85 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
+
+void VPlanTransforms::convertFindLastRecurrences(
+ VPlan &Plan, VPRecipeBuilder &RecipeBuilder,
+ LoopVectorizationLegality *Legal) {
+ assert(Legal && "Need valid LoopVecLegality");
+
+ // FIXME: Scalar-only plans are skipped; may need better handling.
+ if (Plan.hasScalarVFOnly())
+ return;
+
+ // We want to create the following nodes:
+ // vec.body:
+ // mask.phi = phi <VF x i1> [ all.false, vec.ph ], [ new.mask, vec.body ]
+ // ...data.phi already exists, but needs updating...
+ // data.phi = phi <VF x Ty> [ default.val, vec.ph ], [ new.data, vec.body ]
+ //
+ // ...'data' and 'compare' created by existing nodes...
+ //
+ // any_active = i1 any_of_reduction(compare)
+ // new.mask = select any_active, compare, mask.phi
+ // new.data = select any_active, data, data.phi
+ //
+ // middle.block:
+ // ...the extract already exists, but needs updating...
+ // result = extract-last-active new.data, new.mask, default.val
+
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi);
+ VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR);
+
+ // Add mask phi...
+ VPValue *False =
+ Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext()));
+ // FIXME: Either come up with a new phi recipe or make an existing one
+ // more generic. There's only supposed to be one active-lane-mask PHI.
+ VPActiveLaneMaskPHIRecipe *MaskPHI =
+ new VPActiveLaneMaskPHIRecipe(False, DebugLoc());
+ Builder.insert(MaskPHI);
+
+ SelectInst *Select = cast<SelectInst>(RdxDesc.getLoopExitInstr());
+ auto *SR = cast<VPWidenSelectRecipe>(RecipeBuilder.getRecipe(Select));
+
+ // Add select for mask...
+ VPValue *Cond = SR->getCond();
+ Builder.setInsertPoint(SR);
+ VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond});
+ VPValue *SplatAnyOf =
+ Builder.createNaryOp(VPInstruction::Broadcast, AnyOf);
+ VPValue *MaskSelect = Builder.createSelect(SplatAnyOf, Cond, MaskPHI);
+ MaskPHI->addOperand(MaskSelect);
+
+ // Create new select for data...
+ VPValue *DataSelect = Builder.createSelect(SplatAnyOf, SR->getOperand(1),
+ SR->getOperand(2));
+ SR->replaceAllUsesWith(DataSelect);
+ SR->eraseFromParent();
+
+ // Find final reduction and replace it with an
+ // extract.last.active intrinsic.
+ VPInstruction *RdxResult = nullptr;
+ for (VPUser *U : DataSelect->users()) {
+ VPInstruction *I = dyn_cast<VPInstruction>(U);
+ if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) {
+ RdxResult = I;
+ break;
+ }
+ }
+
+ assert(RdxResult);
+ Builder.setInsertPoint(RdxResult);
+ VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn(
+ RdxDesc.getRecurrenceStartValue());
+ auto *ExtractLastActive = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default},
+ RdxResult->getDebugLoc());
+ RdxResult->replaceAllUsesWith(ExtractLastActive);
+ RdxResult->eraseFromParent();
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..71aa6f9457c1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -23,6 +23,7 @@ namespace llvm {
class InductionDescriptor;
class Instruction;
+class LoopVectorizationLegality;
class PHINode;
class ScalarEvolution;
class PredicatedScalarEvolution;
@@ -377,6 +378,13 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+ /// Change FindLast reductions to save the appropriate state using selects
+ /// for entire vectors for both the latest mask containing at least one active
+ /// element and the corresponding data vector.
+ static void convertFindLastRecurrences(VPlan &Plan,
+ VPRecipeBuilder &RecipeBuilder,
+ LoopVectorizationLegality *Legal);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
new file mode 100644
index 0000000000000..5c7ff36d21b79
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON
+; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE
+
+target triple = "aarch64-linux-gnu"
+
+define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
+; NEON-LABEL: define i32 @simple_csa_int_select(
+; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[T_010:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; NEON-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NEON-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], [[TMP7]]
+; NEON-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP7]], i32 [[T_010]]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: ret i32 [[SPEC_SELECT]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_select(
+; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]]
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A_FR]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <vscale x 4 x i32> [[WIDE_LOAD]]
+; SVE-NEXT: [[TMP5:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]]
+; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP5]])
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP9]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[TMP7]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]
+; SVE-NEXT: [[TMP8]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[WIDE_LOAD_FR]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP8]], <vscale x 4 x i1> [[TMP7]], i32 -1)
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[T_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; SVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; SVE-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; SVE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A_FR]], [[TMP13]]
+; SVE-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[T_010]]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
new file mode 100644
index 0000000000000..447fea5feb8aa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -0,0 +1,125 @@
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \
+; RUN: -disable-output 2>&1 < %s | FileCheck %s
+
+
+; This function is derived from the following C program:
+; int simple_csa_int_select(int N, int *data, int a) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data[i])
+; t = data[i];
+; }
+; return t;
+; }
+define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
+ %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+ %ld = load i32, ptr %ld.addr, align 4
+ %select.cmp = icmp slt i32 %a, %ld
+ %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%10>
+; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%4> = phi ir<false>, vp<%9>
+; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
+; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr>
+; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6>
+; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
+; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp>
+; CHECK-NEXT: EMIT vp<%8> = broadcast vp<%7>
+; CHECK-NEXT: EMIT vp<%9> = select vp<%8>, ir<%select.cmp>, vp<%4>
+; CHECK-NEXT: EMIT vp<%10> = select vp<%8>, ir<%ld>, ir<%data.phi>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%12> = extract-last-active vp<%10>, vp<%9>, ir<-1>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%12> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%12>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4
+; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld
+; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%10>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: ACTIVE-LANE-MASK-PHI vp<%4> = phi ir<false>, vp<%9>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%8> = broadcast vp<%7>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%8>, ir<%select.cmp>, vp<%4>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%10> = select vp<%8>, ir<%ld>, ir<%data.phi>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%12>, middle.block ], [ ir<-1>, ir-bb<entry> ]
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%12> = extract-last-active vp<%10>, vp<%9>, ir<-1>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%12> from middle.block)
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 70b1ea13677b8..d133626b5b5bc 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -965,25 +965,141 @@ exit: ; preds = %loop
; In this test, %iv's range will include both signed and unsigned
; maximum (sentinel) values.
define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; IC1VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*:]]
+; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
+; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]]
+; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[TMP11]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; IC1VF4-NEXT: [[TMP12]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]])
+; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*:]]
+; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
+; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]]
+; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[TMP11]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; IC4VF4-NEXT: [[TMP12]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]])
+; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
@@ -1006,25 +1122,167 @@ exit:
}
define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
-; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
-; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
-; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
-; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
-; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; IC1VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
+; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*]]:
+; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
+; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[TMP13]], i64 0
+; IC1VF4-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT5]], <4 x i1> poison, <4 x i32> zeroinitializer
+; IC1VF4-NEXT: [[TMP14]] = select <4 x i1> [[BROADCAST_SPLAT6]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; IC1VF4-NEXT: [[TMP15]] = select <4 x i1> [[BROADCAST_SPLAT6]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
+; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*]]:
+; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
+; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[TMP13]], i64 0
+; IC4VF4-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT5]], <4 x i1> poison, <4 x i32> zeroinitializer
+; IC4VF4-NEXT: [[TMP14]] = select <4 x i1> [[BROADCAST_SPLAT6]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[BROADCAST_SPLAT6]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(
+; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
br label %loop
@@ -1045,3 +1303,5 @@ loop: ; preds = %entry, %loop
exit: ; preds = %loop
ret i64 %cond
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index c958ea7b9b88e..8d79bdc96fab8 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -145,10 +145,46 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-LABEL: define i64 @select_icmp_nuw(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -157,9 +193,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -186,10 +222,46 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-LABEL: define i64 @select_icmp_noflag(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -198,9 +270,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -229,4 +301,8 @@ exit: ; preds = %for.body
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 72ed6537ef640..b1b87fdbdcc1e 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -1,32 +1,160 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4
define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %iv_start ,i64 %n) {
-; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard(
-; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
-; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
-; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
-; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP5]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1: [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3
+; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP5]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4: [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3
+; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
+; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]]
;
entry:
%guard = icmp slt i64 %iv_start, %n
@@ -49,32 +177,166 @@ exit:
}
define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) {
-; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
-; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
-; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
-; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP8]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1: [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP8]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4: [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64
+; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4: [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]]
;
entry:
%guard = icmp slt i32 %iv_start, %n
@@ -101,3 +363,18 @@ exit:
%idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ]
ret i32 %idx.0.lcssa
}
+;.
+; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index b991d58eb2b8d..5b745dadc5a73 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -688,20 +688,51 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a,
; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -713,20 +744,51 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a,
; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -793,20 +855,51 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt
; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -819,20 +912,51 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt
; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -899,41 +1023,65 @@ exit: ; preds = %for.body, %entry
define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
+; CHECK-VF4IC1-NEXT: ret i32 [[TMP7]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
-; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331)
+; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
+; CHECK-VF4IC4-NEXT: ret i32 [[TMP7]]
;
; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) {
@@ -980,44 +1128,116 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard(
@@ -1071,38 +1291,88 @@ exit: ; preds = %for.body
define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) {
; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1)
+; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
-; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP4]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1)
+; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]]
@@ -1156,22 +1426,58 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -1184,22 +1490,58 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p
; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 91c83103bf8f2..656c0031c93d9 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1948,16 +1948,54 @@ exit: ; preds = %for.body
}
; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since
-; the IV hits this value, it is impossible to vectorize this case.
+; the IV hits this value, the loop is instead vectorized as a generic last-active reduction.
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP5]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1967,19 +2005,57 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP5]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
@@ -1989,9 +2065,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
@@ -2042,10 +2118,52 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT4]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT5]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP8]] = select <4 x i1> [[BROADCAST_SPLAT5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2054,18 +2172,60 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT4]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select <4 x i1> [[BROADCAST_SPLAT5]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP8]] = select <4 x i1> [[BROADCAST_SPLAT5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
@@ -2074,9 +2234,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index e4922d3e4f627..110b39f8497d3 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1128,27 +1128,128 @@ exit: ; preds = %loop
ret float %sel
}
-; We don't support selecting loop-variant values.
define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) {
-; CHECK-LABEL: define i32 @select_variant_i32_from_icmp(
-; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
-; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
-; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
-; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[SEL_LCSSA]]
+; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP4]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3)
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP19]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[TMP23]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP27]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3)
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp(
+; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
;
entry:
br label %loop
@@ -1220,6 +1321,8 @@ exit: ; preds = %loop
; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
;.
; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1235,6 +1338,8 @@ exit: ; preds = %loop
; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
;.
; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
More information about the llvm-commits
mailing list