[llvm] Vplan caniv defined by region (PR #144803)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 18 14:51:26 PDT 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/144803
None
>From 9e2e2aafdd3cd49e271e53caad7104c3ca494422 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 12 Jun 2025 18:49:45 +0100
Subject: [PATCH 1/2] Remove dead can IV
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++++++++-
.../AArch64/force-target-instruction-cost.ll | 8 +++-----
.../Transforms/LoopVectorize/X86/cost-model.ll | 4 ----
.../LoopVectorize/X86/induction-costs.ll | 2 --
.../X86/outer_loop_test1_no_explicit_vect_width.ll | 4 +---
6 files changed, 18 insertions(+), 15 deletions(-)
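
For context, a minimal self-contained sketch of the pattern PATCH 1/2 teaches removeDeadRecipes to delete: a phi <-> update cycle whose only users are each other. The types and names below are illustrative stand-ins, not LLVM's actual VPlan classes.

  // Toy model (C++) of the dead VPPhi <-> update cycle check; Node is a
  // hypothetical stand-in for a VPlan recipe with operand/user edges.
  #include <cassert>
  #include <vector>

  struct Node {
    std::vector<Node *> Operands; // values this node reads
    std::vector<Node *> Users;    // nodes that read this node
  };

  // A phi is dead together with its increment when the phi's only user
  // is the increment and the increment's only user is the phi.
  static bool isDeadPhiUpdateCycle(const Node &Phi) {
    if (Phi.Operands.size() != 2 || Phi.Users.size() != 1)
      return false;
    const Node *Inc = Phi.Operands[1]; // backedge value
    return Phi.Users[0] == Inc && Inc->Users.size() == 1;
  }

  int main() {
    Node Start, Phi, Inc;
    Phi.Operands = {&Start, &Inc}; // phi(start, inc)
    Inc.Operands = {&Phi};         // inc only consumes the phi
    Phi.Users = {&Inc};            // phi only feeds the inc
    Inc.Users = {&Phi};
    assert(isDeadPhiUpdateCycle(Phi));
    return 0;
  }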
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3b16248f962bc..82e19dc79194e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7346,6 +7346,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+ VPlanTransforms::removeDeadRecipes(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cc73ae44f9c0b..8b86b13d70a6d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -554,8 +554,20 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- if (isDeadRecipe(R))
+ if (isDeadRecipe(R)) {
R.eraseFromParent();
+ continue;
+ }
+
+ // Check if R is a dead VPPhi <-> update cycle and, if so, remove it.
+ auto *PhiR = dyn_cast<VPPhi>(&R);
+ if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1 ||
+ *PhiR->user_begin() != PhiR->getOperand(1)->getDefiningRecipe() ||
+ PhiR->getOperand(1)->getNumUsers() != 1)
+ continue;
+ PhiR->replaceAllUsesWith(PhiR->getOperand(0));
+ PhiR->setOperand(0, PhiR->getOperand(0));
+ PhiR->eraseFromParent();
}
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 095ac222e1789..a061714e16b19 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -11,12 +11,10 @@ define double @test_reduction_costs() {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> splat (double 3.000000e+00))
; CHECK-NEXT: [[TMP1]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI1]], <2 x double> splat (double 9.000000e+00))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -103,11 +101,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC3]]
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[INDEX4]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2c6fe4f5c808e..b3b082800af45 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -1103,10 +1103,8 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) {
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[BROADCAST_SPLAT2]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]])
@@ -1158,13 +1156,11 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[VEC_PHI1]], splat (i32 1)
; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 0a85548f8750b..6eb6d19874514 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -405,13 +405,11 @@ define i16 @iv_and_step_trunc() {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16>
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
index 02d48cbda1aab..194c31747a94d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -52,8 +52,7 @@
; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> poison, <8 x i32> zeroinitializer
; AVX-LABEL: vector.body:
-; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
-; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc:.*]] ]
; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <8 x i64> %[[VecInd]]
; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[VecIndTr]], <8 x ptr> %[[AAddr]], i32 4, <8 x i1> splat (i1 true))
@@ -71,7 +70,6 @@
; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
; AVX: [[ForInc]]:
-; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8
; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8)
; AVX: br i1 true, label %middle.block, label %vector.body
>From b26546a9383643e01331c261f8cf728800c407b5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 5 Jun 2025 22:17:46 +0100
Subject: [PATCH 2/2] [VPlan] Make CanIV part of region.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 112 ++++++++++++------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 57 +++++++--
llvm/lib/Transforms/Vectorize/VPlan.h | 39 ++++--
.../Transforms/Vectorize/VPlanAnalysis.cpp | 15 ++-
.../Vectorize/VPlanConstruction.cpp | 16 +--
.../Transforms/Vectorize/VPlanPatternMatch.h | 6 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 34 ++++--
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3 +-
.../Transforms/Vectorize/VPlanVerifier.cpp | 6 -
.../AArch64/conditional-branches-cost.ll | 8 +-
.../AArch64/divs-with-scalable-vfs.ll | 4 +-
.../AArch64/induction-costs-sve.ll | 8 +-
.../AArch64/masked-call-scalarize.ll | 4 +-
.../LoopVectorize/AArch64/masked-call.ll | 2 +-
.../LoopVectorize/AArch64/optsize_minsize.ll | 8 +-
.../AArch64/partial-reduce-dot-product.ll | 6 +-
.../AArch64/reduction-recurrence-costs-sve.ll | 6 +-
.../AArch64/scalable-strict-fadd.ll | 14 +--
.../LoopVectorize/AArch64/store-costs-sve.ll | 2 +-
.../sve-interleaved-masked-accesses.ll | 8 +-
.../AArch64/sve-tail-folding-forced.ll | 2 +-
.../sve-tail-folding-overflow-checks.ll | 2 +-
.../AArch64/sve-tail-folding-reductions.ll | 12 +-
.../AArch64/sve-tail-folding-unroll.ll | 4 +-
.../LoopVectorize/AArch64/sve-tail-folding.ll | 20 ++--
.../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +-
.../AArch64/tail-fold-uniform-memops.ll | 2 +-
.../AArch64/tail-folding-styles.ll | 2 +-
...eave-to-widen-memory-remove-loop-region.ll | 2 +-
.../AArch64/uniform-args-call-variants.ll | 10 +-
.../Transforms/LoopVectorize/RISCV/pr88802.ll | 2 +-
.../LoopVectorize/X86/constant-fold.ll | 4 +-
.../X86/drop-poison-generating-flags.ll | 24 ++--
.../LoopVectorize/X86/interleave-cost.ll | 4 +-
...eref-pred-poison-ub-ops-feeding-pointer.ll | 10 +-
.../Transforms/LoopVectorize/uniform-blend.ll | 2 +-
38 files changed, 283 insertions(+), 182 deletions(-)
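
Before the diff, a rough sketch of the ownership model PATCH 2/2 introduces: the canonical IV hangs off the region itself rather than sitting as the first phi recipe in the region's header block, so the region must clone and destroy it, and materialize it back into an explicit phi plus increment when the region is dissolved. Toy types with assumed names, not the actual VPlan API:

  // Toy model (C++): a loop region that owns its canonical IV.
  #include <memory>
  #include <string>
  #include <vector>

  struct CanonicalIV {
    std::string StartValue;
  };

  struct Region {
    std::vector<std::string> HeaderPhis; // explicit header phi recipes
    std::unique_ptr<CanonicalIV> CanIV;  // implicit IV, owned by the region

    Region clone() const {
      Region R;
      R.HeaderPhis = HeaderPhis;
      if (CanIV) // mirrors VPRegionBlock::clone copying CanIV
        R.CanIV = std::make_unique<CanonicalIV>(*CanIV);
      return R;
    }

    // Mirrors dissolveToCFGLoop: turn the implicit IV back into an
    // explicit scalar phi fed by its increment in the latch.
    void dissolveToCFGLoop() {
      if (!CanIV)
        return;
      HeaderPhis.insert(HeaderPhis.begin(),
                        "index = phi(" + CanIV->StartValue + ", index.next)");
      CanIV.reset(); // the region no longer owns an implicit IV
    }
  };

  int main() {
    Region R;
    R.CanIV = std::make_unique<CanonicalIV>(CanonicalIV{"0"});
    Region Copy = R.clone(); // deep-copies the IV, like VPlan::duplicate
    Copy.dissolveToCFGLoop();
    return 0;
  }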
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 82e19dc79194e..79f93cd9d04fd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7347,11 +7347,39 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
VPlanTransforms::removeDeadRecipes(BestVPlan);
+
+ // Don't use getDecisionAndClampRange here, because we don't know the UF
+ // yet; it is better to be conservative here rather than to split the
+ // plan up into different VPlans.
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
+ bool IVUpdateMayOverflow = false;
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, BestVF);
+
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // Use NUW for the induction increment if we proved that it won't overflow in
+ // the vector loop or when not folding the tail. In the latter case, we know
+ // that the canonical induction increment will not overflow as the vector trip
+ // count is >= increment and a multiple of the increment.
+
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
Legal->getWidestInductionType());
+ bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
+ if (!HasNUW) {
+ using namespace llvm::VPlanPatternMatch;
+ VPBasicBlock *HeaderVPBB =
+ vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
+ auto *CanIV = cast<VPPhi>(&HeaderVPBB->front());
+ for (auto &P : HeaderVPBB->phis()) {
+ auto *IVInc = P.getOperand(1);
+ if (match(IVInc, m_VPInstruction<Instruction::Add>(m_Specific(CanIV),
+ m_VPValue())))
+ cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
+ }
+ }
+
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif
@@ -9760,48 +9788,54 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
Header->setName("vec.epilog.vector.body");
DenseMap<Value *, Value *> ToFrozen;
+
+ auto *IV = Plan.getCanonicalIV();
+ // When vectorizing the epilogue loop, the canonical induction start
+ // value needs to be changed from zero to the value after the main
+ // vector loop. Find the resume value created during execution of the main
+ // VPlan.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
+ // loop.
+ BasicBlock *MainMiddle = find_singleton<BasicBlock>(
+ predecessors(L->getLoopPreheader()),
+ [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
+ if (BB != EPI.MainLoopIterationCountCheck &&
+ BB != EPI.EpilogueIterationCountCheck &&
+ BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
+ return BB;
+ return nullptr;
+ });
+ using namespace llvm::PatternMatch;
+ Type *IdxTy = IV->getScalarType();
+ PHINode *EPResumeVal = find_singleton<PHINode>(
+ L->getLoopPreheader()->phis(),
+ [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
+ if (P.getType() == IdxTy &&
+ P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
+ match(P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
+ m_SpecificInt(0)))
+ return &P;
+ return nullptr;
+ });
+ assert(EPResumeVal && "must have a resume value for the canonical IV");
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPDerivedIVRecipe>(U) ||
+ cast<VPRecipeBase>(U)->isScalarCast() ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ VPInstruction::BranchOnCount;
+ }) &&
+ "the canonical IV should only be used by its increment or "
+ "ScalarIVSteps when resetting the start value");
+ IV->setOperand(0, VPV);
+
// Ensure that the start values for all header phi recipes are updated before
// vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) {
- if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
- // When vectorizing the epilogue loop, the canonical induction start
- // value needs to be changed from zero to the value after the main
- // vector loop. Find the resume value created during execution of the main
- // VPlan.
- // FIXME: Improve modeling for canonical IV start values in the epilogue
- // loop.
- using namespace llvm::PatternMatch;
- Type *IdxTy = IV->getScalarType();
- PHINode *EPResumeVal = find_singleton<PHINode>(
- L->getLoopPreheader()->phis(),
- [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
- if (P.getType() == IdxTy &&
- match(
- P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
- m_SpecificInt(0)) &&
- all_of(P.incoming_values(), [&EPI](Value *Inc) {
- return Inc == EPI.VectorTripCount ||
- match(Inc, m_SpecificInt(0));
- }))
- return &P;
- return nullptr;
- });
- assert(EPResumeVal && "must have a resume value for the canonical IV");
- VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
- assert(all_of(IV->users(),
- [](const VPUser *U) {
- return isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U) ||
- cast<VPRecipeBase>(U)->isScalarCast() ||
- cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
- }) &&
- "the canonical IV should only be used by its increment or "
- "ScalarIVSteps when resetting the start value");
- IV->setOperand(0, VPV);
- continue;
- }
-
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 773a5a4a829c7..213cbd07466dc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -777,6 +777,9 @@ VPRegionBlock *VPRegionBlock::clone() {
getName(), isReplicator());
for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
Block->setParent(NewRegion);
+
+ if (CanIV)
+ NewRegion->CanIV = CanIV->clone();
return NewRegion;
}
@@ -882,18 +885,41 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
void VPRegionBlock::dissolveToCFGLoop() {
auto *Header = cast<VPBasicBlock>(getEntry());
+ auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
+ if (CanIV && CanIV->getNumUsers() > 0) {
+ auto *ExitingTerm = ExitingLatch->getTerminator();
+ VPInstruction *CanIVInc = nullptr;
+ for (VPUser *U : CanIV->users()) {
+ if (match(U, m_Add(m_VPValue(), m_Specific(&getPlan()->getVFxUF())))) {
+ CanIVInc = cast<VPInstruction>(U);
+ }
+ }
+ if (!CanIVInc) {
+ VPValue *Count;
+ if (match(ExitingLatch->getTerminator(),
+ m_BranchOnCount(m_VPValue(Count), m_VPValue())))
+ CanIVInc = cast<VPInstruction>(Count);
+ }
+ if (!CanIVInc) {
+ CanIVInc =
+ VPBuilder(ExitingTerm)
+ .createOverflowingOp(
+ Instruction::Add, {CanIV, &getPlan()->getVFxUF()},
+ {true, false}, ExitingTerm->getDebugLoc(), "index.next");
+ }
+ auto *ScalarR = VPBuilder(Header, Header->begin())
+ .createScalarPhi({CanIV->getStartValue(), CanIVInc},
+ CanIV->getDebugLoc(), "index");
+ CanIV->replaceAllUsesWith(ScalarR);
+ if (ExitingTerm->getOperand(0) == ScalarR)
+ ExitingTerm->setOperand(0, CanIVInc);
+ }
if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(&Header->front())) {
assert(this == getPlan()->getVectorLoopRegion() &&
"Canonical IV must be in the entry of the top-level loop region");
- auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
- {CanIV->getStartValue(), CanIV->getBackedgeValue()},
- CanIV->getDebugLoc(), "index");
- CanIV->replaceAllUsesWith(ScalarR);
- CanIV->eraseFromParent();
}
VPBlockBase *Preheader = getSinglePredecessor();
- auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
VPBlockBase *Middle = getSingleSuccessor();
VPBlockUtils::disconnectBlocks(Preheader, this);
VPBlockUtils::disconnectBlocks(this, Middle);
@@ -930,7 +956,12 @@ VPlan::~VPlan() {
for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
R.setOperand(I, &DummyValue);
}
+ } else {
+ if (cast<VPRegionBlock>(VPB)->getCanonicalIV())
+ cast<VPRegionBlock>(VPB)->getCanonicalIV()->replaceAllUsesWith(
+ &DummyValue);
}
+
delete VPB;
}
for (VPValue *VPV : getLiveIns())
@@ -1042,7 +1073,12 @@ void VPlan::execute(VPTransformState *State) {
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
+ if (VectorLatchBB->getTerminator()->getIterator() !=
+ VectorLatchBB->begin())
+ Inc->moveBefore(
+ std::prev(VectorLatchBB->getTerminator()->getIterator()));
+ else
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getIterator());
continue;
}
@@ -1251,6 +1287,11 @@ VPlan *VPlan::duplicate() {
// else NewTripCount will be created and inserted into Old2NewVPValues when
// TripCount is cloned. In any case NewPlan->TripCount is updated below.
+ if (auto *LoopRegion = getVectorLoopRegion()) {
+ Old2NewVPValues[LoopRegion->getCanonicalIV()] =
+ NewPlan->getVectorLoopRegion()->getCanonicalIV();
+ }
+
remapOperands(Entry, NewEntry, Old2NewVPValues);
// Initialize remaining fields of cloned VPlan.
@@ -1432,6 +1473,8 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
/// loop region.
static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
const VPRecipeBase *DefR = VPV->getDefiningRecipe();
+ if (DefR && isa<VPCanonicalIVPHIRecipe>(DefR))
+ return true;
return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
DefR->getParent()->getEnclosingLoopRegion());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ab5ff82a77208..291b830922296 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3247,7 +3247,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
VPCanonicalIVPHIRecipe *clone() override {
auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
- R->addOperand(getBackedgeValue());
return R;
}
@@ -3783,6 +3782,8 @@ class VPRegionBlock : public VPBlockBase {
/// instances of output IR corresponding to its VPBlockBases.
bool IsReplicator;
+ VPCanonicalIVPHIRecipe *CanIV = nullptr;
+
/// Use VPlan::createVPRegionBlock to create VPRegionBlocks.
VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
const std::string &Name = "", bool IsReplicator = false)
@@ -3793,12 +3794,26 @@ class VPRegionBlock : public VPBlockBase {
Entry->setParent(this);
Exiting->setParent(this);
}
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, VPValue *StartV,
+ const std::string &Name = "")
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
+ IsReplicator(false),
+ CanIV(new VPCanonicalIVPHIRecipe(StartV, DebugLoc())) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exiting->setParent(this);
+ }
+
VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
: VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
IsReplicator(IsReplicator) {}
public:
- ~VPRegionBlock() override {}
+ ~VPRegionBlock() override {
+ if (CanIV)
+ delete CanIV;
+ }
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPBlockBase *V) {
@@ -3815,6 +3830,11 @@ class VPRegionBlock : public VPBlockBase {
"Entry block cannot have predecessors.");
Entry = EntryBlock;
EntryBlock->setParent(this);
+ if (!cast<VPBasicBlock>(Entry)->empty() &&
+ isa<VPCanonicalIVPHIRecipe>(cast<VPBasicBlock>(Entry)->front())) {
+ CanIV = &cast<VPCanonicalIVPHIRecipe>(cast<VPBasicBlock>(Entry)->front());
+ CanIV->removeFromParent();
+ }
}
const VPBlockBase *getExiting() const { return Exiting; }
@@ -3865,6 +3885,9 @@ class VPRegionBlock : public VPBlockBase {
/// Remove the current region from its VPlan, connecting its predecessor to
/// its entry, and its exiting block to its successor.
void dissolveToCFGLoop();
+
+ VPCanonicalIVPHIRecipe *getCanonicalIV() { return CanIV; }
+ const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return CanIV; }
};
/// VPlan models a candidate for vectorization, encoding various decisions take
@@ -4166,12 +4189,7 @@ class VPlan {
/// Returns the canonical induction recipe of the vector loop.
VPCanonicalIVPHIRecipe *getCanonicalIV() {
- VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
- if (EntryVPBB->empty()) {
- // VPlan native path.
- EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
- }
- return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+ return getVectorLoopRegion()->getCanonicalIV();
}
VPValue *getSCEVExpansion(const SCEV *S) const {
@@ -4212,9 +4230,8 @@ class VPlan {
/// to nullptr. If \p IsReplicator is true, the region is a replicate region.
/// The returned block is owned by the VPlan and deleted once the VPlan is
/// destroyed.
- VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
- bool IsReplicator = false) {
- auto *VPB = new VPRegionBlock(Name, IsReplicator);
+ VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(Name, false);
CreatedBlocks.push_back(VPB);
return VPB;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..8285995a96623 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -25,11 +25,8 @@ using namespace llvm;
VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan)
: Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) {
if (auto LoopRegion = Plan.getVectorLoopRegion()) {
- if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
- &LoopRegion->getEntryBasicBlock()->front())) {
- CanonicalIVTy = CanIV->getScalarType();
- return;
- }
+ CanonicalIVTy = LoopRegion->getCanonicalIV()->getScalarType();
+ return;
}
// If there's no canonical IV, retrieve the type from the trip count
@@ -436,7 +433,13 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
// first and last occurrences of each recipe.
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getVectorLoopRegion());
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPBlockBase *VPB : RPOT) {
+ if (auto *R = dyn_cast<VPRegionBlock>(VPB)) {
+ if (auto *CanIV = R->getCanonicalIV())
+ Idx2Recipe.push_back(CanIV);
+ continue;
+ }
+ auto *VPBB = cast<VPBasicBlock>(VPB);
if (!VPBB->getParent())
break;
for (VPRecipeBase &R : *VPBB) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 593e5063802ba..9dc5dabff35a8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -409,7 +409,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
// LatchExitVPB, taking care to preserve the original predecessor & successor
// order of blocks. Set region entry and exiting after both HeaderVPB and
// LatchVPBB have been disconnected from their predecessors/successors.
- auto *R = Plan.createVPRegionBlock("", false /*isReplicator*/);
+ auto *R = Plan.createVPRegionBlock("");
VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
VPBlockUtils::disconnectBlocks(LatchVPBB, R);
VPBlockUtils::connectBlocks(PreheaderVPBB, R);
@@ -426,29 +426,23 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
VPBasicBlock *LatchVPBB, Type *IdxTy,
DebugLoc DL) {
- using namespace VPlanPatternMatch;
+ using namespace llvm::VPlanPatternMatch;
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddLiveIn(StartIdx);
- // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
HeaderVPBB->insert(CanonicalIVPHI, HeaderVPBB->begin());
- // We are about to replace the branch to exit the region. Remove the original
- // BranchOnCond, if there is any.
+ // We are about to replace the branch to exit the region. Remove the
+ // original BranchOnCond, if there is any.
if (!LatchVPBB->empty() &&
match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue())))
LatchVPBB->getTerminator()->eraseFromParent();
VPBuilder Builder(LatchVPBB);
- // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
- // Initially the induction increment is guaranteed to not wrap, but that may
- // change later, e.g. when tail-folding, when the flags need to be dropped.
- auto *CanonicalIVIncrement = Builder.createOverflowingOp(
+ auto *CanonicalIVIncrement = Builder.createOverflowingOp(
Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {true, false}, DL,
"index.next");
- CanonicalIVPHI->addOperand(CanonicalIVIncrement);
-
// Add the BranchOnCount VPInstruction to the latch.
Builder.createNaryOp(VPInstruction::BranchOnCount,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b2535fe3aa578..f7316e42f8836 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -414,6 +414,12 @@ m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Add>
+m_Add(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1);
+}
+
template <typename Op0_t, typename Op1_t>
inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f3b5c8cfa9885..6559d94fc0dcd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1993,7 +1993,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
+ auto *CanIV = getParent()->getParent()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8b86b13d70a6d..4949eb26ce83a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1397,7 +1397,7 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
auto *CanIV = Plan.getCanonicalIV();
if (!match(Cond, m_Binary<Instruction::ICmp>(
- m_Specific(CanIV->getBackedgeValue()),
+ m_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
m_Specific(&Plan.getVectorTripCount()))) ||
cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
CmpInst::ICMP_EQ)
@@ -1458,14 +1458,14 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
// (BranchOnCond true).
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
- if (all_of(
- Header->phis(),
- IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
+ if (all_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>)) {
for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
HeaderPhiR->eraseFromParent();
}
+ Plan.getCanonicalIV()->replaceAllUsesWith(
+ Plan.getCanonicalIV()->getStartValue());
VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
@@ -1959,11 +1959,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
VPValue *StartV = CanonicalIVPHI->getStartValue();
auto *CanonicalIVIncrement =
- cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ cast<VPInstruction>(EB->getTerminator()->getOperand(0));
// TODO: Check if dropping the flags is needed if
// !DataAndControlFlowWithoutRuntimeCheck.
CanonicalIVIncrement->dropPoisonGeneratingFlags();
- DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+ DebugLoc DL = CanonicalIVPHI->getDebugLoc();
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
// Part * VF
@@ -2000,7 +2000,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
- LaneMaskPhi->insertAfter(CanonicalIVPHI);
+ auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
+ LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
// Create the active lane mask for the next iteration of the loop before the
// original terminator.
@@ -2299,10 +2300,15 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
auto *CanonicalIVPHI = Plan.getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();
+ auto *CanonicalIVIncrement =
+ dyn_cast<VPInstruction>(Plan.getVectorLoopRegion()
+ ->getExitingBasicBlock()
+ ->getTerminator()
+ ->getOperand(0));
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
- EVLPhi->insertAfter(CanonicalIVPHI);
+ EVLPhi->insertBefore(*Header, Header->begin());
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Compute original TC - IV as the AVL (application vector length).
VPValue *AVL = Builder.createNaryOp(
@@ -2317,15 +2323,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
DebugLoc());
- auto *CanonicalIVIncrement =
- cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Builder.setInsertPoint(CanonicalIVIncrement);
VPSingleDefRecipe *OpVPEVL = VPEVL;
if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
IVSize != 32) {
OpVPEVL = Builder.createScalarCast(
IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL,
- CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc());
+ CanonicalIVPHI->getScalarType(), CanonicalIVPHI->getDebugLoc());
}
auto *NextEVLIV = Builder.createOverflowingOp(
Instruction::Add, {OpVPEVL, EVLPhi},
@@ -2339,7 +2343,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
+
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+ Plan.getVectorLoopRegion()
+ ->getExitingBasicBlock()
+ ->getTerminator()
+ ->setOperand(0, CanonicalIVIncrement);
// TODO: support unroll factor > 1.
Plan.setUF(1);
return true;
@@ -3309,7 +3318,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Adjust induction to reflect that the transformed plan only processes one
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
- auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+ auto *Inc = cast<VPInstruction>(
+ VectorLoop->getExitingBasicBlock()->getTerminator()->getOperand(0));
Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
CanIV->getScalarType(), 1 * Plan.getUF())));
Plan.getVF().replaceAllUsesWith(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index dfb5bfabd22b8..c82c9d64281db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -413,6 +413,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
UnrollState Unroller(Plan, UF, Ctx);
+ Unroller.addUniformForAllParts(Plan.getVectorLoopRegion()->getCanonicalIV());
// Iterate over all blocks in the plan starting from Entry, and unroll
// recipes inside them. This includes the vector preheader and middle blocks,
// which may set up or post-process per-part values.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21bb904c0..08a3292fff18e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -98,9 +98,8 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return all_of(R->operands(), isUniformAcrossVFsAndUFs);
}
- auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
// Canonical IV chain is uniform.
- if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+ if (isa<VPCanonicalIVPHIRecipe>(V))
return true;
return TypeSwitch<const VPRecipeBase *, bool>(R)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index fba4a68f4a27b..676c92860e63e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -429,12 +429,6 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
return false;
}
- if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
- errs() << "VPlan vector loop header does not start with a "
- "VPCanonicalIVPHIRecipe\n";
- return false;
- }
-
const VPBasicBlock *Exiting = dyn_cast<VPBasicBlock>(TopRegion->getExiting());
if (!Exiting) {
errs() << "VPlan exiting block is not a VPBasicBlock\n";
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 976f95ff4f0ba..db78b50bcb300 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -732,9 +732,9 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
@@ -862,8 +862,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; DEFAULT: [[PRED_STORE_CONTINUE14]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[EXIT:.*]]
@@ -964,8 +964,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; PRED: [[PRED_STORE_CONTINUE14]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
+; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -1398,10 +1398,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE27]]
; PRED: [[PRED_STORE_CONTINUE27]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
; PRED-NEXT: [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index d59607711b5bf..ed5872264ffbc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -151,10 +151,10 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP35]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP36:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <vscale x 2 x i1> [[TMP36]], i32 0
; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -277,10 +277,10 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index a4dc329c061e3..d63e4d9e419f3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -136,9 +136,9 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
@@ -306,10 +306,10 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; PRED: [[PRED_STORE_CONTINUE2]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
@@ -498,10 +498,10 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]]
; PRED: [[PRED_STORE_CONTINUE7]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
@@ -692,10 +692,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; PRED: [[PRED_STORE_CONTINUE6]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index 078f98f54525b..70ed973de9f87 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -85,9 +85,9 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8
; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]]
; TFCOMMON: pred.store.continue2:
-; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
; TFCOMMON-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFCOMMON-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP18]], i32 0
; TFCOMMON-NEXT: br i1 [[TMP17]], label [[END:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; TFCOMMON: end:
@@ -149,11 +149,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]]
; TFA_INTERLEAVE: pred.store.continue9:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]])
; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
; TFA_INTERLEAVE-NEXT: br i1 [[TMP30]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFA_INTERLEAVE: end:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 2c0fb797d1d10..3210601a80936 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -1021,11 +1021,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label %[[TMP19]]
; TFA_INTERLEAVE: [[TMP19]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]]
; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TFA_INTERLEAVE: [[END]]:
; TFA_INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index e4718dc216358..425772e0832f5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -394,9 +394,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]]
; DEFAULT: [[PRED_STORE_CONTINUE36]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
@@ -542,10 +542,10 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; DEFAULT-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; DEFAULT-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0
; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
@@ -618,10 +618,10 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; OPTSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; OPTSIZE-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; OPTSIZE-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0
; OPTSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
@@ -694,10 +694,10 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; MINSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; MINSIZE-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; MINSIZE-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0
; MINSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index b091452e28b4a..d6314f32a8fab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1530,9 +1530,9 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
@@ -1575,9 +1575,9 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
@@ -1620,9 +1620,9 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP19]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index c2fe37ad214c6..6911f2f470ffa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -218,7 +218,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP27]]
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
@@ -241,9 +241,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP39:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
; PRED-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[TMP39]], [[WIDE_MASKED_GATHER7]]
; PRED-NEXT: [[TMP41]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI]]
-; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[IV]], i64 [[TMP10]])
; PRED-NEXT: [[TMP42:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], [[TMP5]]
; PRED-NEXT: [[TMP43:%.*]] = extractelement <vscale x 4 x i1> [[TMP42]], i32 0
; PRED-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
@@ -481,9 +481,9 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP20:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 8 x i16> [[TMP20]], [[VEC_PHI]]
; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP21]], <vscale x 8 x i16> [[VEC_PHI]]
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
; PRED-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 82f64e988e6d3..2f529372c09a6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -150,9 +150,9 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP11]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP12]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = extractelement <vscale x 8 x i1> [[TMP14]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -402,7 +402,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP30]], <vscale x 8 x float> [[TMP31]])
; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP34]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP33]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], [[TMP36]]
@@ -417,6 +416,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP40]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP43]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = extractelement <vscale x 8 x i1> [[TMP44]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -666,9 +666,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP16]])
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP18]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -918,9 +918,9 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP15]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP16]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -1154,9 +1154,9 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> splat (float 3.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP16]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -1625,7 +1625,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP41]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP49]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = mul i64 [[TMP50]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
@@ -1640,6 +1639,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP55]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP58]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = extractelement <vscale x 8 x i1> [[TMP59]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -1953,7 +1953,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP41]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP49]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = mul i64 [[TMP50]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
@@ -1968,6 +1967,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP55]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP58]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = extractelement <vscale x 8 x i1> [[TMP59]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index 15c5258b57cc9..43a1587b77b43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -106,9 +106,9 @@ define void @cost_store_i8(ptr %dst) #0 {
; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
; PRED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index bd2bd5aa27952..b1a23915f8ae9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -110,9 +110,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
@@ -241,9 +241,9 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
@@ -376,9 +376,9 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
@@ -537,9 +537,9 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 8fbb356c79742..4b8b24d88e32c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -69,9 +69,9 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
index a22f065415307..71b00153c32d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
@@ -84,8 +84,8 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src,
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 42)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index f4639203d1036..fc90e81a0aa03 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -36,9 +36,9 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -91,9 +91,9 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
@@ -162,9 +162,9 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -215,9 +215,9 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
@@ -291,9 +291,9 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -358,9 +358,9 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]]
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[TMP22]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 977115ce5321a..5558b71a2e483 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -60,7 +60,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
-; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4
; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]]
@@ -75,6 +74,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP9]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP72:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
; CHECK-NEXT: [[TMP76:%.*]] = extractelement <vscale x 4 x i1> [[TMP72]], i32 0
; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -175,7 +175,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
-; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4
; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]]
@@ -190,6 +189,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP9]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP94:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
; CHECK-NEXT: [[TMP98:%.*]] = extractelement <vscale x 4 x i1> [[TMP94]], i32 0
; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 46dcf23ee9cff..da990964d2a13 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -33,9 +33,9 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[TMP13]], i32 0
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -80,9 +80,9 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -135,9 +135,9 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -200,10 +200,10 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP7]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP7]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
@@ -259,9 +259,9 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
@@ -319,9 +319,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
@@ -385,9 +385,9 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
@@ -453,9 +453,9 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[TMP13]], i32 0
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
@@ -512,9 +512,9 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
@@ -575,9 +575,9 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1)
; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 9257e45c809e9..6ad1df3ec03d3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -563,8 +563,8 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
; CHECK-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index f6443b1e28ba6..955f034b03c61 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -24,9 +24,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[IDX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], splat (i1 true)
+; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 71d03afa6b6f1..41d89d82dadf5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -222,9 +222,9 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index fea57fa8b6b68..022789ad9de70 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -107,8 +107,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP31]], align 8
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; VF4: [[PRED_STORE_CONTINUE6]]:
-; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
; VF4-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index ce8492cd77362..787fb90815278 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -25,8 +25,8 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.cleanup:
@@ -63,12 +63,12 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 1
; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
@@ -111,8 +111,8 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond.cleanup:
@@ -149,12 +149,12 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 1
; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
@@ -227,10 +227,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
; INTERLEAVE-NEXT: store double [[TMP8]], ptr [[TMP9]], align 8
; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
; INTERLEAVE: pred.store.continue4:
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; INTERLEAVE-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP10]], [[TMP0]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
; INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 51a8b451dffd9..3dc17e615048e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -137,8 +137,8 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: store i8 [[TMP40]], ptr [[P]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue32:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT1:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 05a495d51c458..7d9ed7d6215c5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -110,8 +110,8 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue8:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -214,8 +214,8 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue8:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 53fd2ed43972c..38b58fbfd1021 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -39,8 +39,8 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -91,8 +91,8 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nusw float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nusw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -143,8 +143,8 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nuw float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nuw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -196,8 +196,8 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input,
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -248,8 +248,8 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -305,8 +305,8 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -356,8 +356,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -409,8 +409,8 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -518,8 +518,8 @@ define void @preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -570,8 +570,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -698,8 +698,8 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP16]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
@@ -778,8 +778,8 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index 7f2544ddf149d..4fbee321b6a48 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -519,8 +519,8 @@ define void @interleave_store_double_i64(ptr %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -640,8 +640,8 @@ define void @interleave_store_i64_double_2(ptr %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index e27734755dfb2..a622193290c8f 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -59,8 +59,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; CHECK: [[PRED_STORE_CONTINUE4]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -132,8 +132,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -203,8 +203,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -276,8 +276,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -350,8 +350,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index fe533672f2ca2..130db548ca8cb 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -291,8 +291,8 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) {
; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP8]], align 4
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; CHECK: [[PRED_STORE_CONTINUE12]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]