[llvm] [VPlan] Optimize FindLast of FindIV w/o sentinel. (PR #172569)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 10:52:42 PST 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/172569
>From 6f7fa51e31fa57a32ccb60980411a4816bac7c5f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 21 Jan 2026 10:40:29 +0000
Subject: [PATCH 1/2] [VPlan] Optimize FindLast of FindIV w/o sentinel.
For FindLast reduction selecting an IV, we can avoid the horizontal
AnyOf in the vector loop, by introducing an independent boolean
reduction to track if the condition was ever true in the loop. If it was
never true in the loop, we select the start value, otherwise the select
the min/max of the FindIV reduction, as required by the predicate.
The main advantage of this approach is that we have 2 independent
reductions, that do not require a horizontal AnyOf reduction in the
loop.
Currently this requires a non-wrapping IV, but this can be relaxed in
the future by selecting a canonical IV, which is then transformed to the
specific derived IV for the reduction after the loop.
Depends on https://github.com/llvm/llvm-project/pull/177870.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 75 +++-
.../LoopVectorize/iv-select-cmp-decreasing.ll | 188 +++++++--
.../LoopVectorize/iv-select-cmp-no-wrap.ll | 22 +-
.../iv-select-cmp-non-const-iv-start.ll | 156 ++++++--
.../Transforms/LoopVectorize/iv-select-cmp.ll | 357 ++++++++++++++----
5 files changed, 627 insertions(+), 171 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6e19c944fe186..a595b68567677 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5497,12 +5497,12 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
PhiR->getRecurrenceKind()))
continue;
- // Get the IV from the backedge value of the reduction phi.
+ // Get the IV and condition from the backedge value of the reduction phi.
// The backedge value should be a select between the phi and the IV.
VPValue *BackedgeVal = PhiR->getBackedgeValue();
- VPValue *TrueVal, *FalseVal;
- if (!match(BackedgeVal,
- m_Select(m_VPValue(), m_VPValue(TrueVal), m_VPValue(FalseVal))))
+ VPValue *Cond, *TrueVal, *FalseVal;
+ if (!match(BackedgeVal, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
+ m_VPValue(FalseVal))))
continue;
// The non-phi operand of the select is the IV.
@@ -5526,36 +5526,83 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
if (!SentinelVal) {
SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
- if (!SentinelVal)
- continue;
UseSigned = false;
}
- auto *RdxResult = cast<VPInstruction>(vputils::findRecipe(
+ // Fall back to boolean tracking if IV doesn't wrap.
+ if (!SentinelVal) {
+ auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
+ if (AR->hasNoSignedWrap()) {
+ UseSigned = true;
+ } else if (AR->hasNoUnsignedWrap()) {
+ UseSigned = false;
+ } else {
+ continue;
+ }
+ }
+
+ VPInstruction *RdxResult = cast<VPInstruction>(vputils::findRecipe(
BackedgeVal,
match_fn(m_VPInstruction<VPInstruction::ComputeReductionResult>())));
- // Create the reduction result in the middle block using sentinel directly.
RecurKind MinMaxKind =
UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
: (UseSigned ? RecurKind::SMin : RecurKind::UMin);
VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
FastMathFlags());
- VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
DebugLoc ExitDL = RdxResult->getDebugLoc();
VPBuilder MiddleBuilder(RdxResult);
VPValue *ReducedIV =
MiddleBuilder.createNaryOp(VPInstruction::ComputeReductionResult,
RdxResult->getOperand(0), Flags, ExitDL);
- auto *Cmp =
- MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV, Sentinel, ExitDL);
- VPInstruction *NewRdxResult = MiddleBuilder.createSelect(
- Cmp, ReducedIV, PhiR->getStartValue(), ExitDL);
+
+ VPInstruction *NewRdxResult;
+ VPValue *StartVPV = PhiR->getStartValue();
+ if (SentinelVal) {
+ // Sentinel-based approach: reduce IVs with min/max, compare against
+ // sentinel to detect if condition was ever true, select accordingly.
+ VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
+ auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
+ Sentinel, ExitDL);
+ NewRdxResult = cast<VPInstruction>(
+ MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL));
+ StartVPV = Sentinel;
+ } else {
+ // Introduce a boolean AnyOf reduction to track if the condition was ever
+ // true in the loop. Use it to select the initial start value, if it was
+ // never true.
+ auto *AnyOfPhi = new VPReductionPHIRecipe(
+ nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
+ RdxUnordered{1}, /*HasUsesOutsideReductionChain=*/false);
+ AnyOfPhi->insertAfter(PhiR);
+
+ VPBuilder LoopBuilder(
+ cast<VPRecipeBase>(BackedgeVal->getDefiningRecipe()));
+ VPValue *AnyOfCond = Cond;
+ if (TrueVal == PhiR)
+ AnyOfCond = LoopBuilder.createNot(Cond);
+ VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
+ AnyOfPhi->setOperand(1, OrVal);
+
+ NewRdxResult =
+ MiddleBuilder.createNaryOp(VPInstruction::ComputeAnyOfResult,
+ {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
+
+ // Initialize the IV reduction phi with the neutral element, not the
+ // original start value, to ensure correct min/max reduction results.
+ unsigned BitWidth = IVSCEV->getType()->getScalarSizeInBits();
+ APInt NeutralVal = UseMax
+ ? (UseSigned ? APInt::getSignedMinValue(BitWidth)
+ : APInt::getZero(BitWidth))
+ : (UseSigned ? APInt::getSignedMaxValue(BitWidth)
+ : APInt::getAllOnes(BitWidth));
+ StartVPV = Plan.getConstantInt(NeutralVal);
+ }
RdxResult->replaceAllUsesWith(NewRdxResult);
RdxResult->eraseFromParent();
auto *NewPhiR = new VPReductionPHIRecipe(
- cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *Sentinel,
+ cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
*BackedgeVal, RdxUnordered{1}, PhiR->hasUsesOutsideReductionChain());
NewPhiR->insertBefore(PhiR);
PhiR->replaceAllUsesWith(NewPhiR);
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 2a19ebdd77219..0c176c25b5c61 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -1256,8 +1256,6 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
-; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC1VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
@@ -1265,8 +1263,8 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4: [[VECTOR_BODY]]:
; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
@@ -1280,22 +1278,22 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
-; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
-; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
-; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; IC1VF4-NEXT: [[TMP13]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; IC1VF4: [[MIDDLE_BLOCK]]:
-; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP13]])
+; IC1VF4-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
+; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP17]]
+; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 [[RDX_START]]
; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; IC1VF4: [[SCALAR_PH]]:
; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
-; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; IC1VF4-NEXT: br label %[[LOOP:.*]]
; IC1VF4: [[LOOP]]:
; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -1310,7 +1308,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; IC1VF4: [[EXIT]]:
-; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; IC1VF4-NEXT: ret i64 [[COND_LCSSA]]
;
; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
@@ -1319,14 +1317,12 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
-; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; IC4VF4: [[VECTOR_PH]]:
-; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
-; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IC4VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 -1, i64 -2, i64 -3>
@@ -1334,37 +1330,88 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4: [[VECTOR_BODY]]:
; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP34:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP35:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP36:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC4VF4-NEXT: [[TMP17:%.*]] = add nsw <4 x i64> [[STEP_ADD]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP6:%.*]] = add nsw <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
+; IC4VF4-NEXT: [[TMP18:%.*]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0
; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3
+; IC4VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -4
+; IC4VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i64 -3
+; IC4VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -8
+; IC4VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 -3
+; IC4VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -12
+; IC4VF4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP14]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP38]], align 8
; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD9]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD10]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0
; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3
+; IC4VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -4
+; IC4VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i64 -3
+; IC4VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -8
+; IC4VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 -3
+; IC4VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -12
+; IC4VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 -3
; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
+; IC4VF4-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x i64>, ptr [[TMP25]], align 8
; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD15]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD16]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD17]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
-; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]]
-; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
-; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
-; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP27:%.*]] = icmp sgt <4 x i64> [[REVERSE11]], [[REVERSE19]]
+; IC4VF4-NEXT: [[TMP28:%.*]] = icmp sgt <4 x i64> [[REVERSE12]], [[REVERSE20]]
+; IC4VF4-NEXT: [[TMP29:%.*]] = icmp sgt <4 x i64> [[REVERSE13]], [[REVERSE21]]
+; IC4VF4-NEXT: [[TMP30]] = or <4 x i1> [[VEC_PHI4]], [[TMP11]]
+; IC4VF4-NEXT: [[TMP31]] = or <4 x i1> [[VEC_PHI5]], [[TMP27]]
+; IC4VF4-NEXT: [[TMP32]] = or <4 x i1> [[VEC_PHI6]], [[TMP28]]
+; IC4VF4-NEXT: [[TMP33]] = or <4 x i1> [[VEC_PHI7]], [[TMP29]]
+; IC4VF4-NEXT: [[TMP34]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[TMP35]] = select <4 x i1> [[TMP27]], <4 x i64> [[TMP17]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT: [[TMP36]] = select <4 x i1> [[TMP28]], <4 x i64> [[TMP6]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT: [[TMP37]] = select <4 x i1> [[TMP29]], <4 x i64> [[TMP18]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; IC4VF4: [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]])
+; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP34]], <4 x i64> [[TMP35]])
+; IC4VF4-NEXT: [[RDX_MINMAX23:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP36]])
+; IC4VF4-NEXT: [[RDX_MINMAX25:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX23]], <4 x i64> [[TMP37]])
+; IC4VF4-NEXT: [[TMP39:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX25]])
+; IC4VF4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP31]], [[TMP30]]
+; IC4VF4-NEXT: [[BIN_RDX25:%.*]] = or <4 x i1> [[TMP32]], [[BIN_RDX]]
+; IC4VF4-NEXT: [[BIN_RDX24:%.*]] = or <4 x i1> [[TMP33]], [[BIN_RDX25]]
+; IC4VF4-NEXT: [[TMP40:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX24]])
+; IC4VF4-NEXT: [[TMP41:%.*]] = freeze i1 [[TMP40]]
+; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP41]], i64 [[TMP39]], i64 [[RDX_START]]
; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; IC4VF4: [[SCALAR_PH]]:
; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
-; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; IC4VF4-NEXT: br label %[[LOOP:.*]]
; IC4VF4: [[LOOP]]:
; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -1379,27 +1426,100 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; IC4VF4: [[EXIT]]:
-; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; IC4VF4-NEXT: ret i64 [[COND_LCSSA]]
;
; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start(
; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; IC4VF1-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1)
+; IC4VF1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]]
+; IC4VF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; IC4VF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF1: [[VECTOR_PH]]:
+; IC4VF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; IC4VF1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC4VF1-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]]
; IC4VF1-NEXT: br label %[[LOOP:.*]]
; IC4VF1: [[LOOP]]:
-; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
-; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ]
-; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP35:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP36:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 9223372036854775807, %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[IV:%.*]] = sub i64 [[N]], [[INDEX]]
+; IC4VF1-NEXT: [[TMP3:%.*]] = add i64 [[IV]], -1
+; IC4VF1-NEXT: [[TMP4:%.*]] = add i64 [[IV]], -2
+; IC4VF1-NEXT: [[TMP5:%.*]] = add i64 [[IV]], -3
+; IC4VF1-NEXT: [[IV_NEXT:%.*]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; IC4VF1-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP4]], -1
+; IC4VF1-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP5]], -1
; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; IC4VF1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; IC4VF1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; IC4VF1-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; IC4VF1-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]]
+; IC4VF1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP7]]
+; IC4VF1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP8]]
+; IC4VF1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]]
; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8
+; IC4VF1-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP19]], align 8
+; IC4VF1-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP20]], align 8
+; IC4VF1-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP21]], align 8
; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]]
+; IC4VF1-NEXT: [[TMP27:%.*]] = icmp sgt i64 [[TMP15]], [[TMP23]]
+; IC4VF1-NEXT: [[TMP28:%.*]] = icmp sgt i64 [[TMP16]], [[TMP24]]
+; IC4VF1-NEXT: [[TMP29:%.*]] = icmp sgt i64 [[TMP17]], [[TMP25]]
+; IC4VF1-NEXT: [[TMP30]] = or i1 [[VEC_PHI4]], [[CMP_A_B]]
+; IC4VF1-NEXT: [[TMP31]] = or i1 [[VEC_PHI5]], [[TMP27]]
+; IC4VF1-NEXT: [[TMP32]] = or i1 [[VEC_PHI6]], [[TMP28]]
+; IC4VF1-NEXT: [[TMP33]] = or i1 [[VEC_PHI7]], [[TMP29]]
; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]]
-; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1
-; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; IC4VF1-NEXT: [[TMP35]] = select i1 [[TMP27]], i64 [[TMP7]], i64 [[VEC_PHI1]]
+; IC4VF1-NEXT: [[TMP36]] = select i1 [[TMP28]], i64 [[TMP8]], i64 [[VEC_PHI2]]
+; IC4VF1-NEXT: [[TMP37]] = select i1 [[TMP29]], i64 [[TMP9]], i64 [[VEC_PHI3]]
+; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4VF1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF1: [[MIDDLE_BLOCK]]:
+; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smin.i64(i64 [[COND]], i64 [[TMP35]])
+; IC4VF1-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX]], i64 [[TMP36]])
+; IC4VF1-NEXT: [[RDX_MINMAX11:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX9]], i64 [[TMP37]])
+; IC4VF1-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP31]], [[TMP30]]
+; IC4VF1-NEXT: [[BIN_RDX10:%.*]] = or i1 [[TMP32]], [[BIN_RDX]]
+; IC4VF1-NEXT: [[BIN_RDX11:%.*]] = or i1 [[TMP33]], [[BIN_RDX10]]
+; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; IC4VF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC4VF1-NEXT: br i1 [[CMP_N]], label %[[EXIT1:.*]], label %[[EXIT]]
; IC4VF1: [[EXIT]]:
-; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4VF1-NEXT: br label %[[LOOP1:.*]]
+; IC4VF1: [[LOOP1]]:
+; IC4VF1-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], %[[LOOP1]] ], [ [[BC_RESUME_VAL]], %[[EXIT]] ]
+; IC4VF1-NEXT: [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[LOOP1]] ], [ [[BC_MERGE_RDX]], %[[EXIT]] ]
+; IC4VF1-NEXT: [[IV_NEXT1]] = add nsw i64 [[IV1]], -1
+; IC4VF1-NEXT: [[GEP_A_IV1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT1]]
+; IC4VF1-NEXT: [[LD_A1:%.*]] = load i64, ptr [[GEP_A_IV1]], align 8
+; IC4VF1-NEXT: [[GEP_B_IV1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT1]]
+; IC4VF1-NEXT: [[LD_B1:%.*]] = load i64, ptr [[GEP_B_IV1]], align 8
+; IC4VF1-NEXT: [[CMP_A_B1:%.*]] = icmp sgt i64 [[LD_A1]], [[LD_B1]]
+; IC4VF1-NEXT: [[COND1]] = select i1 [[CMP_A_B1]], i64 [[IV_NEXT1]], i64 [[RDX1]]
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV1]], 1
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP1]], label %[[EXIT1]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF1: [[EXIT1]]:
+; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[LOOP1]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; IC4VF1-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index 377afaea8f7fd..e0091cdb5f8a0 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -150,35 +150,33 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 [[II]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -193,7 +191,7 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
index 88bb91efa0410..0e07de9d5b403 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll
@@ -17,8 +17,6 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
@@ -26,28 +24,28 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP9]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -60,7 +58,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
; CHECK-VF4IC1: [[EXIT]]:
; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -73,43 +71,71 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]:
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]]
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VF4IC4: [[VECTOR_PH]]:
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]]
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 12
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP6]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP5]], align 4
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], splat (i64 3)
+; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], splat (i64 3)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], splat (i64 3)
+; CHECK-VF4IC4-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI4]], [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP18]] = or <4 x i1> [[VEC_PHI5]], [[TMP7]]
+; CHECK-VF4IC4-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI6]], [[TMP21]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI7]], [[TMP9]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x i1> [[TMP21]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP14]], <4 x i64> [[TMP15]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX12:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP16]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX14:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX12]], <4 x i64> [[TMP17]])
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX14]])
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP18]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX14:%.*]] = or <4 x i1> [[TMP12]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX13:%.*]] = or <4 x i1> [[TMP13]], [[BIN_RDX14]]
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX13]])
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = freeze i1 [[TMP20]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP22]], i64 [[TMP19]], i64 [[RDX_START]]
; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -122,7 +148,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
; CHECK-VF4IC4: [[EXIT]]:
; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -134,19 +160,78 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %
; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]]
; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]]
; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]:
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[IV_START]]
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = add i64 [[IV_START]], [[INDEX]]
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[IV]], 2
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[IV]], 3
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 4
; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp sgt i64 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp sgt i64 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp sgt i64 [[TMP12]], 3
+; CHECK-VF1IC4-NEXT: [[TMP17]] = or i1 [[VEC_PHI4]], [[CMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP18]] = or i1 [[VEC_PHI5]], [[TMP14]]
+; CHECK-VF1IC4-NEXT: [[TMP19]] = or i1 [[VEC_PHI6]], [[TMP15]]
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI7]], [[TMP16]]
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]]
-; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP23]] = select i1 [[TMP15]], i64 [[TMP3]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP24]] = select i1 [[TMP16]], i64 [[TMP4]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP22]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP23]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP24]])
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP18]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX10:%.*]] = or i1 [[TMP19]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX11:%.*]] = or i1 [[TMP20]], [[BIN_RDX10]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY1:.*]]
+; CHECK-VF1IC4: [[FOR_BODY1]]:
+; CHECK-VF1IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[RDX_7:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], 3
+; CHECK-VF1IC4-NEXT: [[COND1]] = select i1 [[CMP2]], i64 [[IV1]], i64 [[RDX_7]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[FOR_BODY1]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
; CHECK-VF1IC4: [[EXIT]]:
; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ]
@@ -370,3 +455,8 @@ exit:
; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
+; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index a889c2425a339..ab2b24dc51683 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1959,40 +1959,38 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP9]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
@@ -2005,52 +2003,86 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VF4IC4: [[VECTOR_PH]]:
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 12
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 12
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI4]], [[TMP3]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI5]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP24]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP25]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP24]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP17]], <4 x i64> [[TMP18]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP14]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP15]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX18]]
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = freeze i1 [[TMP23]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP26]], i64 [[TMP22]], i64 [[RDX_START]]
; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL19]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
@@ -2063,29 +2095,99 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = add i64 -9223372036854775808, [[N_VEC]]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[IV_I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP33:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP34:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV_J:%.*]] = add i64 -9223372036854775808, [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = add i64 [[IV_J]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV_J]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[IV_J]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[IV_I]], 1
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = add i64 [[IV_I]], 2
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = add i64 [[IV_I]], 3
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP6]]
; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP17]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP18]], align 8
; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = icmp sgt i64 [[TMP13]], [[TMP21]]
+; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = icmp sgt i64 [[TMP14]], [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP27]] = or i1 [[VEC_PHI4]], [[CMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP28]] = or i1 [[VEC_PHI5]], [[TMP24]]
+; CHECK-VF1IC4-NEXT: [[TMP29]] = or i1 [[VEC_PHI6]], [[TMP25]]
+; CHECK-VF1IC4-NEXT: [[TMP30]] = or i1 [[VEC_PHI7]], [[TMP26]]
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]]
-; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
-; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
+; CHECK-VF1IC4-NEXT: [[TMP32]] = select i1 [[TMP24]], i64 [[TMP11]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP33]] = select i1 [[TMP25]], i64 [[TMP2]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP34]] = select i1 [[TMP26]], i64 [[TMP3]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV_I]], 4
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP34]])
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP28]], [[TMP27]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX10:%.*]] = or i1 [[TMP29]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX11:%.*]] = or i1 [[TMP30]], [[BIN_RDX10]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY1:.*]]
+; CHECK-VF1IC4: [[FOR_BODY1]]:
+; CHECK-VF1IC4-NEXT: [[IV_J1:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[IV_I1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL12]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT: [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT: [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP36]], [[TMP37]]
+; CHECK-VF1IC4-NEXT: [[COND1]] = select i1 [[CMP3]], i64 [[IV_J1]], i64 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I1]], 1
+; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J1]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[FOR_BODY1]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -2121,8 +2223,6 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
@@ -2130,30 +2230,30 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1: [[VECTOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]])
+; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP10]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]]
; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2168,52 +2268,86 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 16
; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VF4IC4: [[VECTOR_PH]]:
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 16
; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-VF4IC4: [[VECTOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 12
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 4
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 8
+; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 12
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP22]], align 8
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]]
-; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
+; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI4]], [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI5]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP25]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP26]]
+; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[TMP19]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT: [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT: [[TMP21]] = select <4 x i1> [[TMP26]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP18]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT: [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP21]])
+; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX18]]
+; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
+; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = freeze i1 [[TMP24]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP27]], i64 [[TMP23]], i64 [[RDX_START]]
; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -2228,27 +2362,94 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -9223372036854775808, %[[VECTOR_PH]] ], [ [[TMP32:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI5:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI6:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI7:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[IV]], 2
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = add i64 [[IV]], 3
; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
; CHECK-VF1IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
; CHECK-VF1IC4-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP22:%.*]] = icmp sgt i64 [[TMP10]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = icmp sgt i64 [[TMP11]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[TMP25]] = or i1 [[VEC_PHI4]], [[CMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP26]] = or i1 [[VEC_PHI5]], [[TMP22]]
+; CHECK-VF1IC4-NEXT: [[TMP27]] = or i1 [[VEC_PHI6]], [[TMP23]]
+; CHECK-VF1IC4-NEXT: [[TMP28]] = or i1 [[VEC_PHI7]], [[TMP24]]
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]]
-; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[TMP30]] = select i1 [[TMP22]], i64 [[TMP2]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT: [[TMP31]] = select i1 [[TMP23]], i64 [[TMP3]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT: [[TMP32]] = select i1 [[TMP24]], i64 [[TMP4]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP30]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
+; CHECK-VF1IC4-NEXT: [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP26]], [[TMP25]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX10:%.*]] = or i1 [[TMP27]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX11:%.*]] = or i1 [[TMP28]], [[BIN_RDX10]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY1:.*]]
+; CHECK-VF1IC4: [[FOR_BODY1]]:
+; CHECK-VF1IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP34:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
+; CHECK-VF1IC4-NEXT: [[TMP35:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP34]], [[TMP35]]
+; CHECK-VF1IC4-NEXT: [[COND1]] = select i1 [[CMP3]], i64 [[IV1]], i64 [[RDX1]]
+; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND1]], %[[FOR_BODY1]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
@@ -2299,7 +2500,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP3]])
; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP5]], 9223372036854775807
@@ -2320,7 +2521,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[NOT_IV]], i64 [[RDX]]
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]:
; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC1-NEXT: br label %[[EXIT]]
@@ -2373,7 +2574,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
; CHECK-VF4IC4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP13]])
; CHECK-VF4IC4-NEXT: [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP14]])
@@ -2397,7 +2598,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[NOT_IV]], i64 [[RDX]]
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]:
; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: br label %[[EXIT]]
@@ -2448,7 +2649,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF1IC4-NEXT: [[TMP22]] = select i1 [[TMP14]], i64 [[TMP18]], i64 [[VEC_PHI3]]
; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-VF1IC4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF1IC4-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP19]], i64 [[TMP20]])
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX]], i64 [[TMP21]])
@@ -2471,7 +2672,7 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[NOT_IV]], i64 [[RDX]]
; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]:
; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF1IC4-NEXT: br label %[[EXIT]]
>From 114ea96a87b584bd71a1ff4bab768036421f0d5e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 12 Feb 2026 18:51:59 +0000
Subject: [PATCH 2/2] !fixup address comments, thanks!
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 26 ++++++++-----------
1 file changed, 11 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3f15a2a8aee37..ea23229c89ceb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
@@ -5505,16 +5506,17 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
UseSigned = false;
}
- // Fall back to boolean tracking if IV doesn't wrap.
+ // If no sentinel was found, fall back to a boolean AnyOf reduction to track
+ // if the condition was ever true. Requires the IV to not wrap, otherwise we
+ // cannot use min/max.
if (!SentinelVal) {
auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
- if (AR->hasNoSignedWrap()) {
+ if (AR->hasNoSignedWrap())
UseSigned = true;
- } else if (AR->hasNoUnsignedWrap()) {
+ else if (AR->hasNoUnsignedWrap())
UseSigned = false;
- } else {
+ else
continue;
- }
}
VPInstruction *RdxResult = cast<VPInstruction>(vputils::findRecipe(
@@ -5548,12 +5550,11 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
// true in the loop. Use it to select the initial start value, if it was
// never true.
auto *AnyOfPhi = new VPReductionPHIRecipe(
- nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
+ /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
RdxUnordered{1}, /*HasUsesOutsideReductionChain=*/false);
AnyOfPhi->insertAfter(PhiR);
- VPBuilder LoopBuilder(
- cast<VPRecipeBase>(BackedgeVal->getDefiningRecipe()));
+ VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
VPValue *AnyOfCond = Cond;
if (TrueVal == PhiR)
AnyOfCond = LoopBuilder.createNot(Cond);
@@ -5566,13 +5567,8 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
// Initialize the IV reduction phi with the neutral element, not the
// original start value, to ensure correct min/max reduction results.
- unsigned BitWidth = IVSCEV->getType()->getScalarSizeInBits();
- APInt NeutralVal = UseMax
- ? (UseSigned ? APInt::getSignedMinValue(BitWidth)
- : APInt::getZero(BitWidth))
- : (UseSigned ? APInt::getSignedMaxValue(BitWidth)
- : APInt::getAllOnes(BitWidth));
- StartVPV = Plan.getConstantInt(NeutralVal);
+ StartVPV = Plan.getOrAddLiveIn(
+ getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
}
RdxResult->replaceAllUsesWith(NewRdxResult);
RdxResult->eraseFromParent();
More information about the llvm-commits
mailing list