[llvm] [VPlan] Optimize FindLast of (binop %IV, live-in) by sinking. (PR #183911)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 10 09:54:38 PDT 2026


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/183911

>From b922b409a5439feca160232a7ba8bc124af019ec Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Feb 2026 22:18:37 +0000
Subject: [PATCH 1/3] [VPlan] Optimize FindLast of binary op of an induction.

When finding the last occurrence of a value computed by an expression
that depends on an induction, we can vectorize this by selecting the
IV itself and sinking the expression into the middle block.

This follows one of @ayalz's suggestions during earlier discussions for
adding support for CAS/FindLast patterns.

This patch starts with the simplest case, where the selected value is a
simple binary expression of a wide IV and a loop-invariant operand.

This should always be profitable: the current restriction to binary
operators ensures that the width of the wide IV matches the original
reduction width, so we won't introduce any new, wider reduction phi
recipes, and we remove both the boolean reduction and the horizontal
reduction in the loop.

I will also share follow-ups to support sinking loads soon.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  77 +++++++-
 .../find-last-iv-sinkable-expr.ll             | 143 ++++++++++----
 .../LoopVectorize/iv-select-cmp-decreasing.ll | 187 +++++++-----------
 .../Transforms/LoopVectorize/iv-select-cmp.ll | 139 +++++++------
 4 files changed, 314 insertions(+), 232 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b8329bb396ff6..ab0c93be7ec77 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5663,6 +5663,43 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
   }
 }
 
+/// Check if \p V is a binary expression of a widened IV and a loop-invariant
+/// value. Returns the widened IV if found, nullptr otherwise.
+static VPWidenIntOrFpInductionRecipe *
+getExpressionIV(VPValue *V, VPRegionBlock *VectorLoopRegion) {
+  auto *BinOp = dyn_cast<VPWidenRecipe>(V);
+  if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
+    return nullptr;
+
+  VPWidenIntOrFpInductionRecipe *WideIV = nullptr;
+  for (VPValue *Op : BinOp->operands()) {
+    // Exactly one operand must be a widened IV; all others must be loop invariant.
+    auto *Cand = dyn_cast<VPWidenIntOrFpInductionRecipe>(Op);
+    if (Cand) {
+      if (WideIV)
+        return nullptr;
+      WideIV = Cand;
+      continue;
+    }
+    if (!Op->isDefinedOutsideLoopRegions())
+      return nullptr;
+  }
+  return WideIV;
+}
+
+/// Create a scalar version of the binary op \p V in the middle block,
+/// replacing \p WidenIV with \p ScalarIV.
+static VPValue *sinkBinOpToMiddleBlock(VPValue *V, VPValue *ScalarIV,
+                                       VPWidenIntOrFpInductionRecipe *WidenIV,
+                                       VPBuilder &Builder) {
+  auto *BinOp = cast<VPWidenRecipe>(V->getDefiningRecipe());
+  auto Ops = to_vector(map_range(BinOp->operands(), [&](VPValue *Op) {
+    return Op == WidenIV ? ScalarIV : Op;
+  }));
+  return Builder.createNaryOp(BinOp->getOpcode(), Ops, VPIRFlags(*BinOp),
+                              BinOp->getDebugLoc());
+}
+
 void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                                PredicatedScalarEvolution &PSE,
                                                Loop &L) {
@@ -5709,11 +5746,17 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                     m_VPValue(FalseVal))))
       continue;
 
-    // The non-phi operand of the select is the IV.
-    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
+    // The non-phi operand of the select is the IV (or expression of IV).
+    assert(is_contained(BackedgeVal->getDefiningRecipe()->operands(), PhiR));
     VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
 
-    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
+    // Check if IV is a simple expression of a widened IV.
+    // If so, we can track the underlying IV and sink the expression.
+    VPWidenIntOrFpInductionRecipe *ExpressionIV =
+        getExpressionIV(IV, VectorLoopRegion);
+    VPValue *IVToTrack = ExpressionIV ? ExpressionIV : IV;
+
+    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IVToTrack, PSE, &L);
     const SCEV *Step;
     if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
       continue;
@@ -5750,6 +5793,17 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
         BackedgeVal,
         match_fn(m_VPInstruction<VPInstruction::ComputeReductionResult>())));
 
+    VPValue *NewBackedgeVal = CondSelect;
+    // If IV is an expression, create a new select that tracks the underlying
+    // IV directly and update the backedge value.
+    if (ExpressionIV) {
+      auto *SelectR = BackedgeVal->getDefiningRecipe();
+      VPBuilder LoopBuilder(SelectR);
+      NewBackedgeVal = LoopBuilder.createSelect(Cond, ExpressionIV, PhiR,
+                                                SelectR->getDebugLoc());
+    }
+
+    // Create the reduction result in the middle block using the sentinel directly.
     RecurKind MinMaxKind =
         UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
                : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
@@ -5757,9 +5811,8 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                     FastMathFlags());
     DebugLoc ExitDL = RdxResult->getDebugLoc();
     VPBuilder MiddleBuilder(RdxResult);
-    VPValue *ReducedIV =
-        MiddleBuilder.createNaryOp(VPInstruction::ComputeReductionResult,
-                                   RdxResult->getOperand(0), Flags, ExitDL);
+    VPValue *ReducedIV = MiddleBuilder.createNaryOp(
+        VPInstruction::ComputeReductionResult, NewBackedgeVal, Flags, ExitDL);
 
     VPValue *NewRdxResult;
     VPValue *StartVPV = PhiR->getStartValue();
@@ -5769,8 +5822,15 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
       VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
       auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                            Sentinel, ExitDL);
+
+      // If IV was an expression, sink the expression to the middle block.
+      VPValue *ResultVal = ReducedIV;
+      if (ExpressionIV)
+        ResultVal =
+            sinkBinOpToMiddleBlock(IV, ReducedIV, ExpressionIV, MiddleBuilder);
+
       NewRdxResult =
-          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
+          MiddleBuilder.createSelect(Cmp, ResultVal, StartVPV, ExitDL);
       StartVPV = Sentinel;
     } else {
       // Introduce a boolean AnyOf reduction to track if the condition was ever
@@ -5802,7 +5862,8 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
 
     auto *NewPhiR = new VPReductionPHIRecipe(
         cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
-        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
+        *NewBackedgeVal, RdxUnordered{1}, {},
+        PhiR->hasUsesOutsideReductionChain());
     NewPhiR->insertBefore(PhiR);
     PhiR->replaceAllUsesWith(NewPhiR);
     PhiR->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
index 0bbc965098766..10404d6ae5aa6 100644
--- a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
+++ b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
@@ -16,23 +16,21 @@ define i64 @findlast_shl_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT:    [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
-; CHECK-NEXT:    [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 -1)
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
+; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -85,23 +83,21 @@ define i64 @findlast_mul_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]], splat (i64 3)
-; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT:    [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
-; CHECK-NEXT:    [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 -1)
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP6]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -232,23 +228,21 @@ define i64 @findlast_or_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT:    [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
-; CHECK-NEXT:    [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 -1)
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -308,8 +302,8 @@ define i64 @findlast_non_canonical_iv_with_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 10, i64 12, i64 14, i64 16>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 10, i64 12, i64 14, i64 16>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 10, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -329,15 +323,15 @@ define i64 @findlast_non_canonical_iv_with_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i64> [[TMP21]], splat (i64 42)
-; CHECK-NEXT:    [[VEC_IND:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 100)
 ; CHECK-NEXT:    [[TMP23]] = select <4 x i1> [[TMP22]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND1]], splat (i64 8)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP23]])
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP27]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP23]])
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP30]], -9223372036854775808
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[TMP30]], 100
 ; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
@@ -549,6 +543,79 @@ done:
   ret i32 %sel
 }
 
+; Test sdiv where the IV is the second operand (divisor). Verifies that
+; operand order is preserved when sinking the expression to the middle block.
+; IV starts at 1 to avoid division by zero.
+define i64 @findlast_sdiv_iv_as_divisor(ptr %a, i64 %n) {
+; CHECK-LABEL: define i64 @findlast_sdiv_iv_as_divisor(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i64 100, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP8]], i64 -1
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[LOAD]], 42
+; CHECK-NEXT:    [[EXPR:%.*]] = sdiv i64 100, [[IV]]
+; CHECK-NEXT:    [[SEL]] = select i1 [[CMP]], i64 [[EXPR]], i64 [[RDX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXIT]], label %[[DONE]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       [[DONE]]:
+; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i64 [ [[SEL]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[SEL_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i64 [ -1, %entry ], [ %sel, %loop ]
+  %gep = getelementptr inbounds i64, ptr %a, i64 %iv
+  %load = load i64, ptr %gep, align 8
+  %cmp = icmp eq i64 %load, 42
+  %expr = sdiv i64 100, %iv
+  %sel = select i1 %cmp, i64 %expr, i64 %rdx
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exit = icmp eq i64 %iv.next, %n
+  br i1 %exit, label %done, label %loop
+
+done:
+  ret i64 %sel
+}
+
 ; Negative test: i8 reduction with IV expression. Should NOT create a wide (i64)
 ; IV in the vectorized code - the IV should be narrowed to i8 since only scalar
 ; i8 values are needed from it.
@@ -578,7 +645,7 @@ define i8 @findlast_narrow_i8_reduction(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v4i8(<4 x i8> [[TMP7]], <4 x i1> [[TMP6]], i8 -1)
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -598,7 +665,7 @@ define i8 @findlast_narrow_i8_reduction(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[SEL]] = select i1 [[CMP]], i8 [[EXPR]], i8 [[RDX]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXIT]], label %[[DONE]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXIT]], label %[[DONE]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       [[DONE]]:
 ; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i8 [[SEL_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index dd3ad4d01b465..bd35ef67961a1 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -162,7 +162,7 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[TMP4:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 12, [[DOTCAST]]
@@ -171,16 +171,16 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1
 ; IC1VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i16> [[REVERSE]], [[BROADCAST_SPLAT]]
-; IC1VF4-NEXT:    [[TMP4:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1)
 ; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[TMP4]], splat (i16 -4)
 ; IC1VF4-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
 ; IC1VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
 ; IC1VF4-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
 ; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
-; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0
+; IC1VF4-NEXT:    [[TMP8:%.*]] = add nsw i16 [[TMP7]], -1
+; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP8]], i16 0
 ; IC1VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC1VF4:       [[EXIT]]:
 ; IC1VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -379,28 +379,21 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC4VF4-NEXT:    [[TMP101:%.*]] = icmp ugt <4 x i16> [[TMP51]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP102:%.*]] = icmp ugt <4 x i16> [[TMP75]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP103:%.*]] = icmp ugt <4 x i16> [[TMP99]], [[BROADCAST_SPLAT]]
-; IC4VF4-NEXT:    [[TMP104:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP105:%.*]] = add nsw <4 x i16> [[STEP_ADD]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP106:%.*]] = add nsw <4 x i16> [[STEP_ADD_2]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP107:%.*]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[TMP104]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[TMP105]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[TMP106]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[TMP107]], <4 x i16> [[VEC_PHI3]]
+; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
 ; IC4VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
-; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]])
-; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]])
+; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP108]], <4 x i16> [[TMP109]])
+; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP110]])
+; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP111]])
 ; IC4VF4-NEXT:    [[TMP116:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[RDX_MINMAX46]])
 ; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP116]], i16 0
+; IC4VF4-NEXT:    [[TMP112:%.*]] = add nsw i16 [[TMP116]], -1
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP112]], i16 0
 ; IC4VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC4VF4:       [[EXIT]]:
 ; IC4VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -487,7 +480,7 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[TMP4:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 12, [[DOTCAST]]
@@ -496,16 +489,16 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x half>, ptr [[TMP2]], align 1
 ; IC1VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x half> [[WIDE_LOAD]], <4 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP3:%.*]] = fcmp ugt <4 x half> [[REVERSE]], [[BROADCAST_SPLAT]]
-; IC1VF4-NEXT:    [[TMP4:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1)
 ; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[TMP4]], splat (i16 -4)
 ; IC1VF4-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
 ; IC1VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
 ; IC1VF4-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
 ; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
-; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0
+; IC1VF4-NEXT:    [[TMP8:%.*]] = add nsw i16 [[TMP7]], -1
+; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP8]], i16 0
 ; IC1VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC1VF4:       [[EXIT]]:
 ; IC1VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -704,28 +697,21 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC4VF4-NEXT:    [[TMP101:%.*]] = fcmp ugt <4 x half> [[TMP51]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP102:%.*]] = fcmp ugt <4 x half> [[TMP75]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP103:%.*]] = fcmp ugt <4 x half> [[TMP99]], [[BROADCAST_SPLAT]]
-; IC4VF4-NEXT:    [[TMP104:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP105:%.*]] = add nsw <4 x i16> [[STEP_ADD]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP106:%.*]] = add nsw <4 x i16> [[STEP_ADD_2]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP107:%.*]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -1)
-; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[TMP104]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[TMP105]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[TMP106]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[TMP107]], <4 x i16> [[VEC_PHI3]]
+; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
 ; IC4VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[TMP112:%.*]] = select <4 x i1> [[TMP0]], <4 x i16> [[TMP108]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
-; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]])
-; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]])
+; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP108]], <4 x i16> [[TMP109]])
+; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP110]])
+; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP111]])
 ; IC4VF4-NEXT:    [[TMP116:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[RDX_MINMAX46]])
 ; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP116]], i16 0
+; IC4VF4-NEXT:    [[TMP112:%.*]] = add nsw i16 [[TMP116]], -1
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP112]], i16 0
 ; IC4VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC4VF4:       [[EXIT]]:
 ; IC4VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -1000,11 +986,14 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC1VF4-NEXT:  [[ENTRY:.*:]]
 ; IC1VF4-NEXT:    br label %[[VECTOR_PH:.*]]
 ; IC1VF4:       [[VECTOR_PH]]:
+; IC1VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC1VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[TMP9:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
 ; IC1VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; IC1VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
@@ -1016,21 +1005,23 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC1VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
 ; IC1VF4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
-; IC1VF4-NEXT:    [[TMP9]] = select <4 x i1> [[TMP8]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT:    [[TMP14:%.*]] = freeze <4 x i1> [[TMP8]]
+; IC1VF4-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; IC1VF4-NEXT:    [[TMP12]] = select i1 [[TMP11]], <4 x i1> [[TMP8]], <4 x i1> [[TMP9]]
+; IC1VF4-NEXT:    [[TMP13]] = select i1 [[TMP11]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
 ; IC1VF4-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
 ; IC1VF4-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
-; IC1VF4-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP9]])
-; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP11]], -1
-; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP11]], i64 [[RDX_START]]
+; IC1VF4-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC1VF4-NEXT:    [[TMP16:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP13]], <4 x i1> [[TMP12]], i64 [[TMP15]])
 ; IC1VF4-NEXT:    br label %[[SCALAR_PH:.*]]
 ; IC1VF4:       [[SCALAR_PH]]:
 ; IC1VF4-NEXT:    br label %[[LOOP:.*]]
 ; IC1VF4:       [[LOOP]]:
 ; IC1VF4-NEXT:    [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; IC1VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[TMP16]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
 ; IC1VF4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IC1VF4-NEXT:    [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
 ; IC1VF4-NEXT:    [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
@@ -1049,73 +1040,42 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC4VF4-NEXT:  [[ENTRY:.*:]]
 ; IC4VF4-NEXT:    br label %[[VECTOR_PH:.*]]
 ; IC4VF4:       [[VECTOR_PH]]:
+; IC4VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; IC4VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; IC4VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC4VF4:       [[VECTOR_BODY]]:
 ; IC4VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC4VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
-; IC4VF4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
-; IC4VF4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT:    [[TMP2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
 ; IC4VF4-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
 ; IC4VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; IC4VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
 ; IC4VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
-; IC4VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -7
-; IC4VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -11
-; IC4VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -15
-; IC4VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
-; IC4VF4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
-; IC4VF4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
-; IC4VF4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
-; IC4VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -3
-; IC4VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -7
-; IC4VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -11
-; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -15
-; IC4VF4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
-; IC4VF4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; IC4VF4-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
+; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
 ; IC4VF4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
-; IC4VF4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[REVERSE16:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC4VF4-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[TMP23:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE14]]
-; IC4VF4-NEXT:    [[TMP24:%.*]] = icmp sgt <4 x i8> [[REVERSE7]], [[REVERSE15]]
-; IC4VF4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i8> [[REVERSE8]], [[REVERSE16]]
-; IC4VF4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i8> [[REVERSE9]], [[REVERSE17]]
-; IC4VF4-NEXT:    [[TMP27]] = select <4 x i1> [[TMP23]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP28]] = select <4 x i1> [[TMP24]], <4 x i64> [[TMP2]], <4 x i64> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP29]] = select <4 x i1> [[TMP25]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP30]] = select <4 x i1> [[TMP26]], <4 x i64> [[TMP4]], <4 x i64> [[VEC_PHI3]]
-; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
-; IC4VF4-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], -16
-; IC4VF4-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC4VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 -3
+; IC4VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; IC4VF4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i8> [[REVERSE17]], [[REVERSE2]]
+; IC4VF4-NEXT:    [[TMP10:%.*]] = freeze <4 x i1> [[TMP26]]
+; IC4VF4-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
+; IC4VF4-NEXT:    [[TMP12]] = select i1 [[TMP11]], <4 x i1> [[TMP26]], <4 x i1> [[TMP2]]
+; IC4VF4-NEXT:    [[TMP17]] = select i1 [[TMP11]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
+; IC4VF4-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP27]], <4 x i64> [[TMP28]])
-; IC4VF4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP29]])
-; IC4VF4-NEXT:    [[RDX_MINMAX19:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX18]], <4 x i64> [[TMP30]])
-; IC4VF4-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX19]])
-; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP32]], -1
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP32]], i64 [[RDX_START]]
+; IC4VF4-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; IC4VF4-NEXT:    [[TMP16:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP17]], <4 x i1> [[TMP12]], i64 [[TMP19]])
 ; IC4VF4-NEXT:    br label %[[SCALAR_PH:.*]]
 ; IC4VF4:       [[SCALAR_PH]]:
 ; IC4VF4-NEXT:    br label %[[LOOP:.*]]
 ; IC4VF4:       [[LOOP]]:
-; IC4VF4-NEXT:    [[IV:%.*]] = phi i64 [ 15, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; IC4VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT:    [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[TMP16]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
 ; IC4VF4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IC4VF4-NEXT:    [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
 ; IC4VF4-NEXT:    [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
@@ -1240,11 +1200,11 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[TMP6:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
-; IC1VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
+; IC1VF4-NEXT:    [[TMP4:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; IC1VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
 ; IC1VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -3
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
@@ -1254,14 +1214,14 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC1VF4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
 ; IC1VF4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
+; IC1VF4-NEXT:    [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP6]], <4 x i64> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[TMP12]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
-; IC1VF4-NEXT:    [[TMP13]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[TMP6]], splat (i64 -4)
 ; IC1VF4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IC1VF4-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
-; IC1VF4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP13]])
+; IC1VF4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP14]])
 ; IC1VF4-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
 ; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP17]]
 ; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 [[RDX_START]]
@@ -1317,11 +1277,8 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC4VF4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
-; IC4VF4-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; IC4VF4-NEXT:    [[TMP17:%.*]] = add nsw <4 x i64> [[STEP_ADD]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP6:%.*]] = add nsw <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP18:%.*]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
+; IC4VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
+; IC4VF4-NEXT:    [[TMP4:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; IC4VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
 ; IC4VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -3
 ; IC4VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 -7
@@ -1352,14 +1309,14 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC4VF4-NEXT:    [[TMP29:%.*]] = icmp sgt <4 x i64> [[REVERSE13]], [[REVERSE21]]
 ; IC4VF4-NEXT:    [[TMP20:%.*]] = icmp sgt <4 x i64> [[REVERSE14]], [[REVERSE22]]
 ; IC4VF4-NEXT:    [[TMP21:%.*]] = icmp sgt <4 x i64> [[REVERSE15]], [[REVERSE23]]
+; IC4VF4-NEXT:    [[TMP34]] = select <4 x i1> [[TMP28]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP35]] = select <4 x i1> [[TMP29]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP36]] = select <4 x i1> [[TMP20]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP37]] = select <4 x i1> [[TMP21]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[TMP30]] = or <4 x i1> [[VEC_PHI4]], [[TMP28]]
 ; IC4VF4-NEXT:    [[TMP31]] = or <4 x i1> [[VEC_PHI5]], [[TMP29]]
 ; IC4VF4-NEXT:    [[TMP32]] = or <4 x i1> [[VEC_PHI6]], [[TMP20]]
 ; IC4VF4-NEXT:    [[TMP33]] = or <4 x i1> [[VEC_PHI7]], [[TMP21]]
-; IC4VF4-NEXT:    [[TMP34]] = select <4 x i1> [[TMP28]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP35]] = select <4 x i1> [[TMP29]], <4 x i64> [[TMP17]], <4 x i64> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP36]] = select <4 x i1> [[TMP20]], <4 x i64> [[TMP6]], <4 x i64> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP37]] = select <4 x i1> [[TMP21]], <4 x i64> [[TMP18]], <4 x i64> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
 ; IC4VF4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 2e11fa158b8d1..6ebc81bfc7ee3 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1978,9 +1978,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
-; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP9]]
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = freeze i1 [[TMP10]]
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i64 [[TMP7]], i64 [[RDX_START]]
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
@@ -2048,31 +2048,31 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
 ; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
 ; CHECK-VF4IC4-NEXT:    [[TMP13]] = or <4 x i1> [[VEC_PHI4]], [[TMP3]]
 ; CHECK-VF4IC4-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI5]], [[TMP10]]
-; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP24]]
-; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP25]]
+; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP25]]
+; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP26]]
 ; CHECK-VF4IC4-NEXT:    [[TMP17]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC4-NEXT:    [[TMP18]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP24]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP26]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP17]], <4 x i64> [[TMP18]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP20]])
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
 ; CHECK-VF4IC4-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP14]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP15]], [[BIN_RDX]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX18]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = freeze i1 [[TMP23]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP26]], i64 [[TMP22]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP15]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX17]]
+; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX18]])
+; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = freeze i1 [[TMP23]]
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP24]], i64 [[TMP22]], i64 [[RDX_START]]
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
@@ -2158,13 +2158,13 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP32]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP34]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP34]])
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX:%.*]] = or i1 [[TMP28]], [[TMP27]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX10:%.*]] = or i1 [[TMP29]], [[BIN_RDX]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX11:%.*]] = or i1 [[TMP30]], [[BIN_RDX10]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP36]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
@@ -2177,10 +2177,10 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF1IC4-NEXT:    [[IV_I1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL12]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I1]]
-; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I1]]
-; CHECK-VF1IC4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP36]], [[TMP37]]
+; CHECK-VF1IC4-NEXT:    [[TMP38:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP37]], [[TMP38]]
 ; CHECK-VF1IC4-NEXT:    [[COND1]] = select i1 [[CMP3]], i64 [[IV_J1]], i64 [[RDX1]]
 ; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I1]], 1
 ; CHECK-VF1IC4-NEXT:    [[INC3]] = add nsw i64 [[IV_J1]], 1
@@ -2246,9 +2246,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]])
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP10]]
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = freeze i1 [[TMP11]]
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i64 [[TMP8]], i64 [[RDX_START]]
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
@@ -2318,31 +2318,31 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP22]], align 8
 ; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
 ; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
 ; CHECK-VF4IC4-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI4]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI5]], [[TMP11]]
-; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP25]]
-; CHECK-VF4IC4-NEXT:    [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP26]]
+; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP26]]
+; CHECK-VF4IC4-NEXT:    [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP27]]
 ; CHECK-VF4IC4-NEXT:    [[TMP18]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP21]] = select <4 x i1> [[TMP26]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP26]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP21]] = select <4 x i1> [[TMP27]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP18]], <4 x i64> [[TMP19]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP21]])
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP21]])
+; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
 ; CHECK-VF4IC4-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX18]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = freeze i1 [[TMP24]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP27]], i64 [[TMP23]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX17]]
+; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX18]])
+; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = freeze i1 [[TMP24]]
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i64 [[TMP23]], i64 [[RDX_START]]
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
@@ -2423,13 +2423,13 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP30]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP32]])
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX:%.*]] = or i1 [[TMP26]], [[TMP25]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX10:%.*]] = or i1 [[TMP27]], [[BIN_RDX]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX11:%.*]] = or i1 [[TMP28]], [[BIN_RDX10]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT:    [[TMP34:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP34]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
@@ -2440,10 +2440,10 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF1IC4-NEXT:    [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
-; CHECK-VF1IC4-NEXT:    [[TMP34:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP35:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
-; CHECK-VF1IC4-NEXT:    [[TMP35:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP34]], [[TMP35]]
+; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP35]], [[TMP36]]
 ; CHECK-VF1IC4-NEXT:    [[COND1]] = select i1 [[CMP3]], i64 [[IV1]], i64 [[RDX1]]
 ; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -2490,20 +2490,20 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
 ; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = xor <4 x i64> [[VEC_IND]], splat (i64 -1)
 ; CHECK-VF4IC1-NEXT:    [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[TMP2]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP2]], splat (i64 4)
 ; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP3]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP5]], 9223372036854775807
+; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP7]], -9223372036854775808
+; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP7]], -1
 ; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP5]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
@@ -2544,10 +2544,10 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[STEP_ADD:%.*]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[STEP_ADD_2:%.*]] = add nuw <4 x i64> [[STEP_ADD]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[STEP_ADD_3:%.*]] = add nuw <4 x i64> [[STEP_ADD_2]], splat (i64 4)
@@ -2563,24 +2563,21 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD4]], zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD5]], zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD6]], zeroinitializer
-; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[VEC_IND]], splat (i64 -1)
-; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[STEP_ADD]], splat (i64 -1)
-; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
-; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = xor <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
-; CHECK-VF4IC4-NEXT:    [[TMP12]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP8]], <4 x i64> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT:    [[TMP13]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP9]], <4 x i64> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP14]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP15]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT:    [[TMP9]] = select <4 x i1> [[TMP5]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT:    [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP12]], <4 x i64> [[TMP13]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP14]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP15]])
-; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX8]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP17]], 9223372036854775807
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP8]], <4 x i64> [[TMP9]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP10]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP11]])
+; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]])
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808
+; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP13]], -1
 ; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP17]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]

>From de3b1d5181c7f6492b60c9c57981f5982044ece4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 3 Mar 2026 19:45:13 +0000
Subject: [PATCH 2/3] !fixup address latest comments, thanks

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1efc0ef8df661..a47236f472b13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5674,33 +5674,31 @@ getExpressionIV(VPValue *V, VPRegionBlock *VectorLoopRegion) {
   if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
     return nullptr;
 
-  VPWidenIntOrFpInductionRecipe *WideIV = nullptr;
-  for (VPValue *Op : BinOp->operands()) {
-    // Operands must be a single IV and others must be loop invariant.
-    auto *Cand = dyn_cast<VPWidenIntOrFpInductionRecipe>(Op);
-    if (Cand) {
-      if (WideIV)
-        return nullptr;
-      WideIV = Cand;
-      continue;
-    }
-    if (!Op->isDefinedOutsideLoopRegions())
+  VPValue *Op0 = BinOp->getOperand(0);
+  VPValue *Op1 = BinOp->getOperand(1);
+  if (!isa<VPWidenIntOrFpInductionRecipe>(Op0))
+    std::swap(Op0, Op1);
+
+  if (!Op1->isDefinedOutsideLoopRegions())
       return nullptr;
-  }
-  return WideIV;
+
+  return dyn_cast<VPWidenIntOrFpInductionRecipe>(Op0);
 }
 
-/// Create a scalar version of the binary op \p V in the middle block,
+/// Create a scalar version of the binary op \p V and sink before \p InsertPt,
 /// replacing \p WidenIV with \p ScalarIV.
 static VPValue *sinkBinOpToMiddleBlock(VPValue *V, VPValue *ScalarIV,
                                        VPWidenIntOrFpInductionRecipe *WidenIV,
-                                       VPBuilder &Builder) {
-  auto *BinOp = cast<VPWidenRecipe>(V->getDefiningRecipe());
-  auto Ops = to_vector(map_range(BinOp->operands(), [&](VPValue *Op) {
-    return Op == WidenIV ? ScalarIV : Op;
-  }));
-  return Builder.createNaryOp(BinOp->getOpcode(), Ops, VPIRFlags(*BinOp),
-                              BinOp->getDebugLoc());
+                                       VPBasicBlock::iterator InsertPt) {
+  auto *BinOp = V->getDefiningRecipe()->clone();
+  if (BinOp->getOperand(0) == WidenIV)
+    BinOp->setOperand(0, ScalarIV);
+  else {
+    assert(BinOp->getOperand(1) ==  WidenIV && "one operand must be WideIV");
+    BinOp->setOperand(1, ScalarIV);
+  }
+  BinOp->insertBefore(*InsertPt->getParent(), InsertPt);
+  return cast<VPWidenRecipe>(BinOp);
 }
 
 void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
@@ -5749,17 +5747,14 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
                                     m_VPValue(FalseVal))))
       continue;
 
-    // The non-phi operand of the select is the IV (or expression of IV).
-    assert(is_contained(BackedgeVal->getDefiningRecipe()->operands(), PhiR));
-    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
-
-    // Check if IV is a simple expression of a widened IV.
-    // If so, we can track the underlying IV and sink the expression.
-    VPWidenIntOrFpInductionRecipe *ExpressionIV =
-        getExpressionIV(IV, VectorLoopRegion);
-    VPValue *IVToTrack = ExpressionIV ? ExpressionIV : IV;
+    // The non-phi operand of the select is the IV or an expression of the IV.
+    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
+    VPValue *ExprOfIV = TrueVal == PhiR ? FalseVal : TrueVal;
 
-    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IVToTrack, PSE, &L);
+    // Check if IV is a simple expression of a widened IV. If so, we can track the underlying IV and sink the expression.
+    auto *ExpressionIV =
+        getExpressionIV(ExprOfIV, VectorLoopRegion);
+    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(ExpressionIV ? ExpressionIV : ExprOfIV, PSE, &L);
     const SCEV *Step;
     if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
       continue;
@@ -5830,7 +5825,7 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
       VPValue *ResultVal = ReducedIV;
       if (ExpressionIV)
         ResultVal =
-            sinkBinOpToMiddleBlock(IV, ReducedIV, ExpressionIV, MiddleBuilder);
+            sinkBinOpToMiddleBlock(ExprOfIV, ReducedIV, ExpressionIV, MiddleBuilder.getInsertPoint());
 
       NewRdxResult =
           MiddleBuilder.createSelect(Cmp, ResultVal, StartVPV, ExitDL);

>From 82f295773f1f76e246a0f8033ea19b4515ae566e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 3 Mar 2026 20:46:21 +0000
Subject: [PATCH 3/3] !fixup address latest comments, thanks

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  85 +++++---
 .../find-last-iv-sinkable-expr.ll             |  70 +++----
 .../LoopVectorize/iv-select-cmp-decreasing.ll | 193 ++++++++++--------
 .../Transforms/LoopVectorize/iv-select-cmp.ll | 106 +++++-----
 4 files changed, 250 insertions(+), 204 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a47236f472b13..1b55c02461447 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5680,7 +5680,7 @@ getExpressionIV(VPValue *V, VPRegionBlock *VectorLoopRegion) {
     std::swap(Op0, Op1);
 
   if (!Op1->isDefinedOutsideLoopRegions())
-      return nullptr;
+    return nullptr;
 
   return dyn_cast<VPWidenIntOrFpInductionRecipe>(Op0);
 }
@@ -5694,7 +5694,7 @@ static VPValue *sinkBinOpToMiddleBlock(VPValue *V, VPValue *ScalarIV,
   if (BinOp->getOperand(0) == WidenIV)
     BinOp->setOperand(0, ScalarIV);
   else {
-    assert(BinOp->getOperand(1) ==  WidenIV && "one operand must be WideIV");
+    assert(BinOp->getOperand(1) == WidenIV && "one operand must be WideIV");
     BinOp->setOperand(1, ScalarIV);
   }
   BinOp->insertBefore(*InsertPt->getParent(), InsertPt);
@@ -5751,10 +5751,11 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
     VPValue *ExprOfIV = TrueVal == PhiR ? FalseVal : TrueVal;
 
-    // Check if IV is a simple expression of a widened IV. If so, we can track the underlying IV and sink the expression.
-    auto *ExpressionIV =
-        getExpressionIV(ExprOfIV, VectorLoopRegion);
-    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(ExpressionIV ? ExpressionIV : ExprOfIV, PSE, &L);
+    // Check if IV is a simple expression of a widened IV. If so, we can track
+    // the underlying IV and sink the expression.
+    auto *ExpressionIV = getExpressionIV(ExprOfIV, VectorLoopRegion);
+    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
+        ExpressionIV ? ExpressionIV : ExprOfIV, PSE, &L);
     const SCEV *Step;
     if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
       continue;
@@ -5763,21 +5764,40 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     if (!SE.isKnownNonZero(Step))
       continue;
 
-    // Positive step means we need UMax/SMax to find the last IV value, and
-    // UMin/SMin otherwise.
-    bool UseMax = SE.isKnownPositive(Step);
-    bool UseSigned = true;
-    std::optional<APInt> SentinelVal =
-        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
-    if (!SentinelVal) {
-      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
-      UseSigned = false;
+    bool UseMax;
+    bool UseSigned;
+    std::optional<APInt> SentinelVal;
+    auto TryToGetSentinel = [&](const SCEV *IVSCEV, const SCEV *Step) {
+      // Positive step means we need UMax/SMax to find the last IV value, and
+      // UMin/SMin otherwise.
+      UseMax = SE.isKnownPositive(Step);
+      UseSigned = true;
+      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
+      if (!SentinelVal) {
+        SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
+        UseSigned = false;
+      }
+    };
+
+    // First, check if we can vectorize with a sentinel value, using
+    // ExpressionIV if available and ExprOfIV otherwise.
+    TryToGetSentinel(IVSCEV, Step);
+    if (!SentinelVal && ExpressionIV) {
+      // If we weren't able to use a sentinel with ExpressionIV, check whether
+      // it is possible with ExprOfIV.
+      const SCEV *ExprOfIVSCEV =
+          vputils::getSCEVExprForVPValue(ExprOfIV, PSE, &L);
+      if (match(ExprOfIVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
+        TryToGetSentinel(ExprOfIVSCEV, Step);
+        IVSCEV = ExprOfIVSCEV;
+      }
     }
 
     // If no sentinel was found, fall back to a boolean AnyOf reduction to track
     // if the condition was ever true. Requires the IV to not wrap, otherwise we
     // cannot use min/max.
     if (!SentinelVal) {
+      // TODO: Support expression sinking with boolean any-of tracking.
       auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
       if (AR->hasNoSignedWrap())
         UseSigned = true;
@@ -5791,14 +5811,15 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
         BackedgeVal,
         match_fn(m_VPInstruction<VPInstruction::ComputeReductionResult>())));
 
-    VPValue *NewBackedgeVal = CondSelect;
     // If IV is an expression, create a new select that tracks the underlying
     // IV directly and update the backedge value.
     if (ExpressionIV) {
-      auto *SelectR = BackedgeVal->getDefiningRecipe();
+      auto *SelectR = CondSelect->getDefiningRecipe();
       VPBuilder LoopBuilder(SelectR);
-      NewBackedgeVal = LoopBuilder.createSelect(Cond, ExpressionIV, PhiR,
-                                                SelectR->getDebugLoc());
+      if (TrueVal == PhiR)
+        Cond = LoopBuilder.createNot(Cond);
+      CondSelect = LoopBuilder.createSelect(Cond, ExpressionIV, PhiR,
+                                            SelectR->getDebugLoc());
     }
 
     // Create the reduction result in the middle block using sentinel directly.
@@ -5810,7 +5831,13 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
     DebugLoc ExitDL = RdxResult->getDebugLoc();
     VPBuilder MiddleBuilder(RdxResult);
     VPValue *ReducedIV = MiddleBuilder.createNaryOp(
-        VPInstruction::ComputeReductionResult, NewBackedgeVal, Flags, ExitDL);
+        VPInstruction::ComputeReductionResult, CondSelect, Flags, ExitDL);
+
+    // If IV was an expression, sink the expression to the middle block.
+    VPValue *LoopExitingVal = ReducedIV;
+    if (ExpressionIV)
+      LoopExitingVal = sinkBinOpToMiddleBlock(ExprOfIV, ReducedIV, ExpressionIV,
+                                              MiddleBuilder.getInsertPoint());
 
     VPValue *NewRdxResult;
     VPValue *StartVPV = PhiR->getStartValue();
@@ -5820,15 +5847,8 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
       VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
       auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                            Sentinel, ExitDL);
-
-      // If IV was an expression, sink the expression to the middle block.
-      VPValue *ResultVal = ReducedIV;
-      if (ExpressionIV)
-        ResultVal =
-            sinkBinOpToMiddleBlock(ExprOfIV, ReducedIV, ExpressionIV, MiddleBuilder.getInsertPoint());
-
       NewRdxResult =
-          MiddleBuilder.createSelect(Cmp, ResultVal, StartVPV, ExitDL);
+          MiddleBuilder.createSelect(Cmp, LoopExitingVal, StartVPV, ExitDL);
       StartVPV = Sentinel;
     } else {
       // Introduce a boolean AnyOf reduction to track if the condition was ever
@@ -5846,9 +5866,9 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
       VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
       AnyOfPhi->setOperand(1, OrVal);
 
-      NewRdxResult =
-          MiddleBuilder.createNaryOp(VPInstruction::ComputeAnyOfResult,
-                                     {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
+      NewRdxResult = MiddleBuilder.createNaryOp(
+          VPInstruction::ComputeAnyOfResult, {StartVPV, LoopExitingVal, OrVal},
+          {}, ExitDL);
 
       // Initialize the IV reduction phi with the neutral element, not the
       // original start value, to ensure correct min/max reduction results.
@@ -5860,8 +5880,7 @@ void VPlanTransforms::optimizeFindIVReductions(VPlan &Plan,
 
     auto *NewPhiR = new VPReductionPHIRecipe(
         cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
-        *NewBackedgeVal, RdxUnordered{1}, {},
-        PhiR->hasUsesOutsideReductionChain());
+        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
     NewPhiR->insertBefore(PhiR);
     PhiR->replaceAllUsesWith(NewPhiR);
     PhiR->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
index 10404d6ae5aa6..9c4b552158c3e 100644
--- a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
+++ b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr.ll
@@ -16,21 +16,21 @@ define i64 @findlast_shl_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP6]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], -9223372036854775808
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -83,21 +83,21 @@ define i64 @findlast_mul_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP6]], 3
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], -9223372036854775808
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -228,21 +228,21 @@ define i64 @findlast_or_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP3]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP10]], i64 -1
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], -9223372036854775808
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -302,7 +302,7 @@ define i64 @findlast_non_canonical_iv_with_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 10, i64 12, i64 14, i64 16>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 10, i64 12, i64 14, i64 16>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 10, [[TMP5]]
@@ -323,15 +323,15 @@ define i64 @findlast_non_canonical_iv_with_expr(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i64> [[TMP21]], splat (i64 42)
-; CHECK-NEXT:    [[TMP23]] = select <4 x i1> [[TMP22]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP23]] = select <4 x i1> [[TMP22]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND1]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP23]])
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP30]], -9223372036854775808
-; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[TMP30]], 100
+; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP23]])
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[TMP25]], 100
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP25]], -9223372036854775808
 ; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
@@ -573,9 +573,9 @@ define i64 @findlast_sdiv_iv_as_divisor(ptr %a, i64 %n) {
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
-; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i64 100, [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i64 [[TMP8]], i64 -1
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 100, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 -1
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[DONE:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -645,7 +645,7 @@ define i8 @findlast_narrow_i8_reduction(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v4i8(<4 x i8> [[TMP7]], <4 x i1> [[TMP6]], i8 -1)
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -665,7 +665,7 @@ define i8 @findlast_narrow_i8_reduction(ptr %a, i64 %n) {
 ; CHECK-NEXT:    [[SEL]] = select i1 [[CMP]], i8 [[EXPR]], i8 [[RDX]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXIT]], label %[[DONE]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXIT]], label %[[DONE]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       [[DONE]]:
 ; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i8 [[SEL_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index bd35ef67961a1..6ef126156db47 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -162,7 +162,7 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP4:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 12, [[DOTCAST]]
@@ -171,15 +171,15 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1
 ; IC1VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i16> [[REVERSE]], [[BROADCAST_SPLAT]]
-; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]]
+; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[TMP4]], splat (i16 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4)
 ; IC1VF4-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
 ; IC1VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
 ; IC1VF4-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
-; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
 ; IC1VF4-NEXT:    [[TMP8:%.*]] = add nsw i16 [[TMP7]], -1
+; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
 ; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP8]], i16 0
 ; IC1VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC1VF4:       [[EXIT]]:
@@ -196,10 +196,10 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC4VF4:       [[VECTOR_BODY]]:
 ; IC4VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE44:.*]] ]
 ; IC4VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP108:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP109:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP110:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP111:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP112:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP113:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP114:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP115:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
 ; IC4VF4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i16> [[STEP_ADD_2]], splat (i16 -4)
@@ -379,21 +379,21 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC4VF4-NEXT:    [[TMP101:%.*]] = icmp ugt <4 x i16> [[TMP51]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP102:%.*]] = icmp ugt <4 x i16> [[TMP75]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP103:%.*]] = icmp ugt <4 x i16> [[TMP99]], [[BROADCAST_SPLAT]]
-; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
+; IC4VF4-NEXT:    [[TMP112]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP113]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP114]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP115]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
 ; IC4VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP108]], <4 x i16> [[TMP109]])
-; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP110]])
-; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP111]])
+; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
+; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]])
+; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]])
 ; IC4VF4-NEXT:    [[TMP116:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[RDX_MINMAX46]])
+; IC4VF4-NEXT:    [[TMP109:%.*]] = add nsw i16 [[TMP116]], -1
 ; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767
-; IC4VF4-NEXT:    [[TMP112:%.*]] = add nsw i16 [[TMP116]], -1
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP112]], i16 0
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP109]], i16 0
 ; IC4VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC4VF4:       [[EXIT]]:
 ; IC4VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -480,7 +480,7 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP4:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i16 12, [[DOTCAST]]
@@ -489,15 +489,15 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x half>, ptr [[TMP2]], align 1
 ; IC1VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x half> [[WIDE_LOAD]], <4 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP3:%.*]] = fcmp ugt <4 x half> [[REVERSE]], [[BROADCAST_SPLAT]]
-; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]]
+; IC1VF4-NEXT:    [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[TMP4]], splat (i16 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4)
 ; IC1VF4-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
 ; IC1VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
 ; IC1VF4-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
-; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
 ; IC1VF4-NEXT:    [[TMP8:%.*]] = add nsw i16 [[TMP7]], -1
+; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
 ; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP8]], i16 0
 ; IC1VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC1VF4:       [[EXIT]]:
@@ -514,10 +514,10 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC4VF4:       [[VECTOR_BODY]]:
 ; IC4VF4-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE44:.*]] ]
 ; IC4VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 12, i16 11, i16 10, i16 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP108:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP109:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP110:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
-; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP111:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP112:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP113:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP114:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
+; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i16> [ splat (i16 32767), %[[VECTOR_PH]] ], [ [[TMP115:%.*]], %[[PRED_LOAD_CONTINUE44]] ]
 ; IC4VF4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 -4)
 ; IC4VF4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i16> [[STEP_ADD_2]], splat (i16 -4)
@@ -697,21 +697,21 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC4VF4-NEXT:    [[TMP101:%.*]] = fcmp ugt <4 x half> [[TMP51]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP102:%.*]] = fcmp ugt <4 x half> [[TMP75]], [[BROADCAST_SPLAT]]
 ; IC4VF4-NEXT:    [[TMP103:%.*]] = fcmp ugt <4 x half> [[TMP99]], [[BROADCAST_SPLAT]]
-; IC4VF4-NEXT:    [[TMP108]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[TMP109]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
-; IC4VF4-NEXT:    [[TMP110]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
-; IC4VF4-NEXT:    [[TMP111]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
+; IC4VF4-NEXT:    [[TMP112]] = select <4 x i1> [[TMP100]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP113]] = select <4 x i1> [[TMP101]], <4 x i16> [[STEP_ADD]], <4 x i16> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP114]] = select <4 x i1> [[TMP102]], <4 x i16> [[STEP_ADD_2]], <4 x i16> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP115]] = select <4 x i1> [[TMP103]], <4 x i16> [[STEP_ADD_3]], <4 x i16> [[VEC_PHI3]]
 ; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
 ; IC4VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP108]], <4 x i16> [[TMP109]])
-; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP110]])
-; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP111]])
+; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
+; IC4VF4-NEXT:    [[RDX_MINMAX45:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX]], <4 x i16> [[TMP114]])
+; IC4VF4-NEXT:    [[RDX_MINMAX46:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[RDX_MINMAX45]], <4 x i16> [[TMP115]])
 ; IC4VF4-NEXT:    [[TMP116:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[RDX_MINMAX46]])
+; IC4VF4-NEXT:    [[TMP109:%.*]] = add nsw i16 [[TMP116]], -1
 ; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767
-; IC4VF4-NEXT:    [[TMP112:%.*]] = add nsw i16 [[TMP116]], -1
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP112]], i16 0
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP109]], i16 0
 ; IC4VF4-NEXT:    br label %[[EXIT:.*]]
 ; IC4VF4:       [[EXIT]]:
 ; IC4VF4-NEXT:    ret i16 [[RDX_SELECT]]
@@ -986,16 +986,13 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC1VF4-NEXT:  [[ENTRY:.*:]]
 ; IC1VF4-NEXT:    br label %[[VECTOR_PH:.*]]
 ; IC1VF4:       [[VECTOR_PH]]:
-; IC1VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; IC1VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP9:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
-; IC1VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 -1, [[INDEX]]
+; IC1VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
 ; IC1VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; IC1VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -3
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
@@ -1005,23 +1002,22 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC1VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
 ; IC1VF4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
-; IC1VF4-NEXT:    [[TMP14:%.*]] = freeze <4 x i1> [[TMP8]]
-; IC1VF4-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
-; IC1VF4-NEXT:    [[TMP12]] = select i1 [[TMP11]], <4 x i1> [[TMP8]], <4 x i1> [[TMP9]]
-; IC1VF4-NEXT:    [[TMP13]] = select i1 [[TMP11]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT:    [[TMP6]] = select <4 x i1> [[TMP8]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
 ; IC1VF4-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
 ; IC1VF4-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
-; IC1VF4-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; IC1VF4-NEXT:    [[TMP16:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP13]], <4 x i1> [[TMP12]], i64 [[TMP15]])
+; IC1VF4-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP6]])
+; IC1VF4-NEXT:    [[TMP9:%.*]] = add i64 [[TMP12]], -1
+; IC1VF4-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP12]], -1
+; IC1VF4-NEXT:    [[TMP11:%.*]] = select i1 [[TMP13]], i64 [[TMP9]], i64 [[RDX_START]]
 ; IC1VF4-NEXT:    br label %[[SCALAR_PH:.*]]
 ; IC1VF4:       [[SCALAR_PH]]:
 ; IC1VF4-NEXT:    br label %[[LOOP:.*]]
 ; IC1VF4:       [[LOOP]]:
 ; IC1VF4-NEXT:    [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; IC1VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[TMP16]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[TMP11]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
 ; IC1VF4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IC1VF4-NEXT:    [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
 ; IC1VF4-NEXT:    [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
@@ -1040,42 +1036,71 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC4VF4-NEXT:  [[ENTRY:.*:]]
 ; IC4VF4-NEXT:    br label %[[VECTOR_PH:.*]]
 ; IC4VF4:       [[VECTOR_PH]]:
-; IC4VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
-; IC4VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; IC4VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC4VF4:       [[VECTOR_BODY]]:
 ; IC4VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC4VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -1, i64 -2, i64 -3, i64 -4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[TMP2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; IC4VF4-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
-; IC4VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; IC4VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP30:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 -1, [[INDEX]]
+; IC4VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
 ; IC4VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
+; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
+; IC4VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -7
+; IC4VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -11
+; IC4VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -15
+; IC4VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; IC4VF4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; IC4VF4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; IC4VF4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; IC4VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; IC4VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -3
+; IC4VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -7
+; IC4VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -11
+; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -15
+; IC4VF4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
+; IC4VF4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; IC4VF4-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
 ; IC4VF4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
+; IC4VF4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT:    [[REVERSE16:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC4VF4-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 -3
-; IC4VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
-; IC4VF4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; IC4VF4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i8> [[REVERSE17]], [[REVERSE2]]
-; IC4VF4-NEXT:    [[TMP10:%.*]] = freeze <4 x i1> [[TMP26]]
-; IC4VF4-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
-; IC4VF4-NEXT:    [[TMP12]] = select i1 [[TMP11]], <4 x i1> [[TMP26]], <4 x i1> [[TMP2]]
-; IC4VF4-NEXT:    [[TMP17]] = select i1 [[TMP11]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]]
-; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
-; IC4VF4-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4
-; IC4VF4-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IC4VF4-NEXT:    [[TMP23:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE14]]
+; IC4VF4-NEXT:    [[TMP24:%.*]] = icmp sgt <4 x i8> [[REVERSE7]], [[REVERSE15]]
+; IC4VF4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i8> [[REVERSE8]], [[REVERSE16]]
+; IC4VF4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i8> [[REVERSE9]], [[REVERSE17]]
+; IC4VF4-NEXT:    [[TMP27]] = select <4 x i1> [[TMP23]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT:    [[TMP28]] = select <4 x i1> [[TMP24]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT:    [[TMP29]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT:    [[TMP30]] = select <4 x i1> [[TMP26]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
+; IC4VF4-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], -16
+; IC4VF4-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IC4VF4:       [[MIDDLE_BLOCK]]:
-; IC4VF4-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
-; IC4VF4-NEXT:    [[TMP16:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP17]], <4 x i1> [[TMP12]], i64 [[TMP19]])
+; IC4VF4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP27]], <4 x i64> [[TMP28]])
+; IC4VF4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP29]])
+; IC4VF4-NEXT:    [[RDX_MINMAX19:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX18]], <4 x i64> [[TMP30]])
+; IC4VF4-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX19]])
+; IC4VF4-NEXT:    [[SUB:%.*]] = add i64 [[TMP32]], -1
+; IC4VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP32]], -1
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[SUB]], i64 [[RDX_START]]
 ; IC4VF4-NEXT:    br label %[[SCALAR_PH:.*]]
 ; IC4VF4:       [[SCALAR_PH]]:
 ; IC4VF4-NEXT:    br label %[[LOOP:.*]]
 ; IC4VF4:       [[LOOP]]:
-; IC4VF4-NEXT:    [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; IC4VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[TMP16]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT:    [[IV:%.*]] = phi i64 [ 15, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT:    [[RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ]
 ; IC4VF4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; IC4VF4-NEXT:    [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]]
 ; IC4VF4-NEXT:    [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1
@@ -1088,7 +1113,7 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC4VF4:       [[EXIT]]:
 ; IC4VF4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ]
 ; IC4VF4-NEXT:    ret i64 [[COND_LCSSA]]
-;
+;
 ; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(
 ; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
 ; IC4VF1-NEXT:  [[ENTRY:.*:]]
@@ -1200,9 +1225,9 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC1VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IC1VF4:       [[VECTOR_BODY]]:
 ; IC1VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[TMP6:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; IC1VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 9223372036854775807), %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
 ; IC1VF4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
 ; IC1VF4-NEXT:    [[TMP4:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
 ; IC1VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
@@ -1214,17 +1239,18 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC1VF4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
 ; IC1VF4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]]
-; IC1VF4-NEXT:    [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP6]], <4 x i64> [[VEC_PHI]]
-; IC1VF4-NEXT:    [[TMP12]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; IC1VF4-NEXT:    [[TMP12]] = select <4 x i1> [[TMP11]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT:    [[TMP17]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
 ; IC1VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[TMP6]], splat (i64 -4)
+; IC1VF4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
 ; IC1VF4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IC1VF4-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; IC1VF4:       [[MIDDLE_BLOCK]]:
-; IC1VF4-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP14]])
-; IC1VF4-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]])
-; IC1VF4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP17]]
-; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 [[RDX_START]]
+; IC1VF4-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP12]])
+; IC1VF4-NEXT:    [[TMP18:%.*]] = add nsw i64 [[TMP13]], -1
+; IC1VF4-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; IC1VF4-NEXT:    [[TMP15:%.*]] = freeze i1 [[TMP14]]
+; IC1VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i64 [[TMP18]], i64 [[RDX_START]]
 ; IC1VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; IC1VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; IC1VF4:       [[SCALAR_PH]]:
@@ -1326,12 +1352,13 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64
 ; IC4VF4-NEXT:    [[RDX_MINMAX23:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP36]])
 ; IC4VF4-NEXT:    [[RDX_MINMAX25:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[RDX_MINMAX23]], <4 x i64> [[TMP37]])
 ; IC4VF4-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX25]])
+; IC4VF4-NEXT:    [[SUB:%.*]] = add nsw i64 [[TMP39]], -1
 ; IC4VF4-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP31]], [[TMP30]]
 ; IC4VF4-NEXT:    [[BIN_RDX25:%.*]] = or <4 x i1> [[TMP32]], [[BIN_RDX]]
 ; IC4VF4-NEXT:    [[BIN_RDX24:%.*]] = or <4 x i1> [[TMP33]], [[BIN_RDX25]]
 ; IC4VF4-NEXT:    [[TMP40:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX24]])
 ; IC4VF4-NEXT:    [[TMP41:%.*]] = freeze i1 [[TMP40]]
-; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP41]], i64 [[TMP39]], i64 [[RDX_START]]
+; IC4VF4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP41]], i64 [[SUB]], i64 [[RDX_START]]
 ; IC4VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; IC4VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; IC4VF4:       [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 6ebc81bfc7ee3..b87a604d6f32c 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1978,9 +1978,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP5]])
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
-; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = freeze i1 [[TMP10]]
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i64 [[TMP7]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP9]]
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
@@ -2048,31 +2048,31 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8
 ; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
 ; CHECK-VF4IC4-NEXT:    [[TMP13]] = or <4 x i1> [[VEC_PHI4]], [[TMP3]]
 ; CHECK-VF4IC4-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI5]], [[TMP10]]
-; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP25]]
-; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP26]]
+; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI6]], [[TMP24]]
+; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI7]], [[TMP25]]
 ; CHECK-VF4IC4-NEXT:    [[TMP17]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC4-NEXT:    [[TMP18]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP26]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP24]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP17]], <4 x i64> [[TMP18]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP20]])
-; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP19]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
 ; CHECK-VF4IC4-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP14]], [[TMP13]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP15]], [[BIN_RDX]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX17]]
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX18]])
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = freeze i1 [[TMP23]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP24]], i64 [[TMP22]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP15]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX18]]
+; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
+; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = freeze i1 [[TMP23]]
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP26]], i64 [[TMP22]], i64 [[RDX_START]]
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
@@ -2158,13 +2158,13 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP32]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP34]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP33]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP34]])
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX:%.*]] = or i1 [[TMP28]], [[TMP27]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX10:%.*]] = or i1 [[TMP29]], [[BIN_RDX]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX11:%.*]] = or i1 [[TMP30]], [[BIN_RDX10]]
-; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = freeze i1 [[BIN_RDX11]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP36]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
@@ -2177,10 +2177,10 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.
 ; CHECK-VF1IC4-NEXT:    [[IV_I1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL12]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I1]]
-; CHECK-VF1IC4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I1]]
-; CHECK-VF1IC4-NEXT:    [[TMP38:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP37]], [[TMP38]]
+; CHECK-VF1IC4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP36]], [[TMP37]]
 ; CHECK-VF1IC4-NEXT:    [[COND1]] = select i1 [[CMP3]], i64 [[IV_J1]], i64 [[RDX1]]
 ; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I1]], 1
 ; CHECK-VF1IC4-NEXT:    [[INC3]] = add nsw i64 [[IV_J1]], 1
@@ -2246,9 +2246,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP6]])
-; CHECK-VF4IC1-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = freeze i1 [[TMP11]]
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i64 [[TMP8]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[TMP10]]
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 [[RDX_START]]
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
@@ -2318,31 +2318,31 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i64>, ptr [[TMP22]], align 8
 ; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
 ; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD8]], [[WIDE_LOAD12]]
-; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
-; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
+; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD9]], [[WIDE_LOAD13]]
+; CHECK-VF4IC4-NEXT:    [[TMP26:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD10]], [[WIDE_LOAD14]]
 ; CHECK-VF4IC4-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI4]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP15]] = or <4 x i1> [[VEC_PHI5]], [[TMP11]]
-; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP26]]
-; CHECK-VF4IC4-NEXT:    [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP27]]
+; CHECK-VF4IC4-NEXT:    [[TMP16]] = or <4 x i1> [[VEC_PHI6]], [[TMP25]]
+; CHECK-VF4IC4-NEXT:    [[TMP17]] = or <4 x i1> [[VEC_PHI7]], [[TMP26]]
 ; CHECK-VF4IC4-NEXT:    [[TMP18]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND1]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC4-NEXT:    [[TMP19]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP26]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT:    [[TMP21]] = select <4 x i1> [[TMP27]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[TMP20]] = select <4 x i1> [[TMP25]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP21]] = select <4 x i1> [[TMP26]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP18]], <4 x i64> [[TMP19]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX15:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
-; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX15]], <4 x i64> [[TMP21]])
-; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX16]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX16:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP20]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX18:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX16]], <4 x i64> [[TMP21]])
+; CHECK-VF4IC4-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX18]])
 ; CHECK-VF4IC4-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
-; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX17]]
-; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX18]])
-; CHECK-VF4IC4-NEXT:    [[TMP25:%.*]] = freeze i1 [[TMP24]]
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i64 [[TMP23]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX18:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT:    [[BIN_RDX17:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX18]]
+; CHECK-VF4IC4-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX17]])
+; CHECK-VF4IC4-NEXT:    [[TMP27:%.*]] = freeze i1 [[TMP24]]
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP27]], i64 [[TMP23]], i64 [[RDX_START]]
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
@@ -2423,13 +2423,13 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[COND]], i64 [[TMP30]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
-; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX8]], i64 [[TMP32]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX9:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX]], i64 [[TMP31]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[RDX_MINMAX9]], i64 [[TMP32]])
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX:%.*]] = or i1 [[TMP26]], [[TMP25]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX10:%.*]] = or i1 [[TMP27]], [[BIN_RDX]]
 ; CHECK-VF1IC4-NEXT:    [[BIN_RDX11:%.*]] = or i1 [[TMP28]], [[BIN_RDX10]]
-; CHECK-VF1IC4-NEXT:    [[TMP34:%.*]] = freeze i1 [[BIN_RDX11]]
-; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP34]], i64 [[RDX_MINMAX9]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = freeze i1 [[BIN_RDX11]]
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX11]], i64 [[RDX_START]]
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
@@ -2440,10 +2440,10 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
 ; CHECK-VF1IC4-NEXT:    [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY1]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[RDX1:%.*]] = phi i64 [ [[COND1:%.*]], %[[FOR_BODY1]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
-; CHECK-VF1IC4-NEXT:    [[TMP35:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP34:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
-; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP35]], [[TMP36]]
+; CHECK-VF1IC4-NEXT:    [[TMP35:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP34]], [[TMP35]]
 ; CHECK-VF1IC4-NEXT:    [[COND1]] = select i1 [[CMP3]], i64 [[IV1]], i64 [[RDX1]]
 ; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -2490,20 +2490,20 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
 ; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-VF4IC1-NEXT:    [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[TMP2]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:    [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[TMP2]], splat (i64 4)
+; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
-; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP7]], -9223372036854775808
 ; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP7]], -1
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP7]], -9223372036854775808
 ; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP5]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
@@ -2576,8 +2576,8 @@ define i64 @select_icmp_xor_not_iv(ptr %a, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX7:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP10]])
 ; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[RDX_MINMAX7]], <4 x i64> [[TMP11]])
 ; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX8]])
-; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808
 ; CHECK-VF4IC4-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP13]], -1
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808
 ; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP17]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]



More information about the llvm-commits mailing list