[llvm] [VPlan] Make canonical IV part of the region (PR #156262)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 31 13:08:21 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/156262
From 1246f74568c9726f271a510dd857b5a58a95d30d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 25 Aug 2025 22:22:04 +0100
Subject: [PATCH 1/3] [VPlan] Track VPValues instead of VPRecipes in
calculateRegisterUsage.
Update calculateRegisterUsageForPlan to track the liveness of VPValues
instead of recipes. This gives slightly more accurate results for
recipes that define multiple values (i.e. VPInterleaveRecipe).

When tracking the liveness of recipes, all VPValues defined by a
VPInterleaveRecipe are considered live until the last use of any of
them. When tracking the liveness of individual VPValues, each value can
be tracked accurately until its own last use.

Note the changes in large-loop-rdx.ll and pr47437.ll: this patch
restores the original behavior from before VPlan-based liveness
tracking was introduced.
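
For illustration, a minimal standalone sketch of the per-value interval
bookkeeping described above. The types are simplified stand-ins, not the
actual VPlan classes, and the register-class and VF scaling done by the
real calculateRegisterUsageForPlan is omitted:

  // Sketch: per-value live intervals. An instruction may define several
  // values; with per-value tracking, a value's interval closes at its own
  // last use instead of staying open until the last use of any sibling
  // definition of the same instruction.
  #include <algorithm>
  #include <cstddef>
  #include <map>
  #include <set>
  #include <vector>

  struct Value {};                                  // stand-in for VPValue
  struct Inst { std::vector<Value *> Defs, Uses; }; // stand-in for VPRecipeBase

  size_t maxOpenIntervals(const std::vector<Inst> &Loop) {
    // Last index at which each value is used; later uses overwrite earlier.
    std::map<Value *, size_t> EndPoint;
    for (size_t I = 0; I != Loop.size(); ++I)
      for (Value *U : Loop[I].Uses)
        EndPoint[U] = I;

    // Transpose: the values whose intervals end at each index.
    std::map<size_t, std::vector<Value *>> TransposeEnds;
    for (const auto &KV : EndPoint)
      TransposeEnds[KV.second].push_back(KV.first);

    std::set<Value *> Open;
    size_t Max = 0;
    for (size_t I = 0; I != Loop.size(); ++I) {
      for (Value *Dead : TransposeEnds[I])
        Open.erase(Dead);               // close intervals ending here
      Max = std::max(Max, Open.size());
      for (Value *D : Loop[I].Defs)
        Open.insert(D);                 // each defined value opens its own
    }
    return Max;
  }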
---
.../Transforms/Vectorize/VPlanAnalysis.cpp | 74 ++---
.../LoopVectorize/AArch64/reg-usage.ll | 2 +-
.../LoopVectorize/PowerPC/large-loop-rdx.ll | 262 ++++++++++++++----
.../Transforms/LoopVectorize/X86/pr47437.ll | 108 ++++++--
4 files changed, 338 insertions(+), 108 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22a..8e5997fa8befe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -10,6 +10,7 @@
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
+#include "VPlanHelpers.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -396,7 +397,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
/// Get the VF scaling factor applied to the recipe's output, if the recipe has
/// one.
-static unsigned getVFScaleFactor(VPRecipeBase *R) {
+static unsigned getVFScaleFactor(VPValue *R) {
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
@@ -422,15 +423,15 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
- // recipe that is the key.
- using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
+ // VPValue that is the key.
+ using IntervalMap = SmallDenseMap<VPValue *, unsigned, 16>;
// Maps indices to recipes.
SmallVector<VPRecipeBase *, 64> Idx2Recipe;
// Marks the end of each interval.
IntervalMap EndPoint;
- // Saves the list of recipe indices that are used in the loop.
- SmallPtrSet<VPRecipeBase *, 8> Ends;
+ // Saves the list of VPValues that are used in the loop.
+ SmallPtrSet<VPValue *, 8> Ends;
// Saves the list of values that are used in the loop but are defined outside
// the loop (not including non-recipe values such as arguments and
// constants).
@@ -441,7 +442,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
// each recipe. We use RPO to ensure that defs are met before their users. We
// assume that each recipe that has in-loop users starts an interval. We
// record every time that an in-loop value is used, so we have a list of the
- // first and last occurrences of each recipe.
+  // first occurrences of each recipe and the last occurrence of each VPValue.
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
LoopRegion);
@@ -470,32 +471,32 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
}
// Overwrite previous end points.
- EndPoint[DefR] = Idx2Recipe.size();
- Ends.insert(DefR);
+ EndPoint[U] = Idx2Recipe.size();
+ Ends.insert(U);
}
}
if (VPBB == LoopRegion->getExiting()) {
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
// exiting block, where their increment will get materialized eventually.
for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- EndPoint[&R] = Idx2Recipe.size();
- Ends.insert(&R);
+ if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ EndPoint[WideIV] = Idx2Recipe.size();
+ Ends.insert(WideIV);
}
}
}
}
// Saves the list of intervals that end with the index in 'key'.
- using RecipeList = SmallVector<VPRecipeBase *, 2>;
- SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
+ using VPValueList = SmallVector<VPValue *, 2>;
+ SmallDenseMap<unsigned, VPValueList, 16> TransposeEnds;
// Next, we transpose the EndPoints into a multi map that holds the list of
// intervals that *end* at a specific location.
for (auto &Interval : EndPoint)
TransposeEnds[Interval.second].push_back(Interval.first);
- SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
+ SmallPtrSet<VPValue *, 8> OpenIntervals;
SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
@@ -519,14 +520,16 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
VPRecipeBase *R = Idx2Recipe[Idx];
- // Remove all of the recipes that end at this location.
- RecipeList &List = TransposeEnds[Idx];
- for (VPRecipeBase *ToRemove : List)
+ // Remove all of the VPValues that end at this location.
+ VPValueList &List = TransposeEnds[Idx];
+ for (VPValue *ToRemove : List)
OpenIntervals.erase(ToRemove);
// Ignore recipes that are never used within the loop and do not have side
// effects.
- if (!Ends.count(R) && !R->mayHaveSideEffects())
+ if (all_of(R->definedValues(),
+ [&Ends](VPValue *Def) { return !Ends.count(Def); }) &&
+ !R->mayHaveSideEffects())
continue;
// Skip recipes for ignored values.
@@ -546,41 +549,38 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
// there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> RegUsage;
- for (auto *R : OpenIntervals) {
- // Skip recipes that weren't present in the original loop.
+ for (auto *VPV : OpenIntervals) {
+ // Skip values that weren't present in the original loop.
// TODO: Remove after removing the legacy
// LoopVectorizationCostModel::calculateRegisterUsage
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
- VPBranchOnMaskRecipe>(R))
+ VPBranchOnMaskRecipe>(VPV))
continue;
if (VFs[J].isScalar() ||
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
- VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(R) ||
- (isa<VPInstruction>(R) &&
- vputils::onlyScalarValuesUsed(cast<VPSingleDefRecipe>(R))) ||
- (isa<VPReductionPHIRecipe>(R) &&
- (cast<VPReductionPHIRecipe>(R))->isInLoop())) {
- unsigned ClassID = TTI.getRegisterClassForType(
- false, TypeInfo.inferScalarType(R->getVPSingleValue()));
+ VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(VPV) ||
+ (isa<VPInstruction>(VPV) && vputils::onlyScalarValuesUsed(VPV)) ||
+ (isa<VPReductionPHIRecipe>(VPV) &&
+ (cast<VPReductionPHIRecipe>(VPV))->isInLoop())) {
+ unsigned ClassID =
+ TTI.getRegisterClassForType(false, TypeInfo.inferScalarType(VPV));
// FIXME: The target might use more than one register for the type
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(R);
+ unsigned ScaleFactor = getVFScaleFactor(VPV);
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
<< " for " << *R << "\n";
});
- for (VPValue *DefV : R->definedValues()) {
- Type *ScalarTy = TypeInfo.inferScalarType(DefV);
- unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
- RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
- }
+ Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+ unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
+ RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
}
}
@@ -593,8 +593,10 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
<< OpenIntervals.size() << '\n');
- // Add the current recipe to the list of open intervals.
- OpenIntervals.insert(R);
+ // Add the VPValues defined by the current recipe to the list of open
+ // intervals.
+ for (VPValue *DefV : R->definedValues())
+ OpenIntervals.insert(DefV);
}
// We also search for instructions that are defined outside the loop, but are
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index e51a925040a49..23cc7e367776a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -74,7 +74,7 @@ define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 48 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 47 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
%cmp100 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
index 0b23206134bc0..43cce8005bbf6 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -10,28 +10,43 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
; CHECK-SAME: ptr noalias [[R:%.*]], ptr noalias [[A:%.*]], i32 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT: br i1 [[CMP24]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_END13:.*]]
-; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT: br i1 [[CMP24]], label %[[ITER_CHECK:.*]], label %[[FOR_END13:.*]]
+; CHECK: [[ITER_CHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP69:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP131:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP132:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP133:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP134:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP135:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 14
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP1]], i64 0, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP4]], i64 0, i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP5]], i64 0, i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP6]], i64 0, i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP7]], i64 0, i32 0
; CHECK-NEXT: [[WIDE_VEC35:%.*]] = load <12 x float>, ptr [[TMP13]], align 8
; CHECK-NEXT: [[STRIDED_VEC36:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
; CHECK-NEXT: [[STRIDED_VEC37:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
@@ -60,116 +75,252 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
; CHECK-NEXT: [[STRIDED_VEC60:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
; CHECK-NEXT: [[STRIDED_VEC61:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
; CHECK-NEXT: [[STRIDED_VEC62:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <12 x float>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC49:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT: [[STRIDED_VEC56:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT: [[STRIDED_VEC63:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC64:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC65:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT: [[WIDE_VEC43:%.*]] = load <12 x float>, ptr [[TMP17]], align 8
+; CHECK-NEXT: [[STRIDED_VEC66:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC67:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT: [[STRIDED_VEC68:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT: [[STRIDED_VEC69:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC70:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC71:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT: [[WIDE_VEC50:%.*]] = load <12 x float>, ptr [[TMP18]], align 8
+; CHECK-NEXT: [[STRIDED_VEC72:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC73:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT: [[STRIDED_VEC80:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT: [[STRIDED_VEC81:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC82:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC83:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT: [[WIDE_VEC57:%.*]] = load <12 x float>, ptr [[TMP19]], align 8
+; CHECK-NEXT: [[STRIDED_VEC84:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC85:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT: [[STRIDED_VEC86:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT: [[STRIDED_VEC87:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC88:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC89:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
; CHECK-NEXT: [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]]
; CHECK-NEXT: [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]]
; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]]
; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]]
+; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC42]], [[STRIDED_VEC42]]
+; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC66]], [[STRIDED_VEC66]]
+; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC72]], [[STRIDED_VEC72]]
+; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC84]], [[STRIDED_VEC84]]
; CHECK-NEXT: [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], [[STRIDED_VEC37]]
; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]]
; CHECK-NEXT: [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]]
; CHECK-NEXT: [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]]
+; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC49]], [[STRIDED_VEC49]]
+; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC67]], [[STRIDED_VEC67]]
+; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC73]], [[STRIDED_VEC73]]
+; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC85]], [[STRIDED_VEC85]]
; CHECK-NEXT: [[TMP80:%.*]] = fadd fast <2 x float> [[TMP72]], [[TMP64]]
; CHECK-NEXT: [[TMP113:%.*]] = fadd fast <2 x float> [[TMP105]], [[TMP97]]
; CHECK-NEXT: [[TMP114:%.*]] = fadd fast <2 x float> [[TMP106]], [[TMP98]]
; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <2 x float> [[TMP107]], [[TMP99]]
-; CHECK-NEXT: [[TMP21:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
-; CHECK-NEXT: [[TMP22:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
-; CHECK-NEXT: [[TMP23:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
-; CHECK-NEXT: [[TMP24:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
-; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x double> [[TMP21]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x double> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x double> [[TMP23]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP24]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
-; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
-; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
-; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
-; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
-; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
-; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
-; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <2 x float> [[TMP108]], [[TMP100]]
; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <2 x float> [[TMP109]], [[TMP101]]
; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <2 x float> [[TMP110]], [[TMP102]]
; CHECK-NEXT: [[TMP119:%.*]] = fadd fast <2 x float> [[TMP111]], [[TMP103]]
+; CHECK-NEXT: [[TMP40:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
+; CHECK-NEXT: [[TMP52:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
+; CHECK-NEXT: [[TMP53:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
+; CHECK-NEXT: [[TMP54:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
; CHECK-NEXT: [[TMP41:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double>
; CHECK-NEXT: [[TMP42:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double>
; CHECK-NEXT: [[TMP43:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double>
; CHECK-NEXT: [[TMP44:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double>
+; CHECK-NEXT: [[TMP55:%.*]] = fadd fast <2 x double> [[TMP40]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP49:%.*]] = fadd fast <2 x double> [[TMP52]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP50:%.*]] = fadd fast <2 x double> [[TMP53]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP51:%.*]] = fadd fast <2 x double> [[TMP54]], [[VEC_PHI4]]
; CHECK-NEXT: [[TMP45:%.*]] = fadd fast <2 x double> [[TMP41]], [[TMP25]]
; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP42]], [[TMP26]]
; CHECK-NEXT: [[TMP47:%.*]] = fadd fast <2 x double> [[TMP43]], [[TMP27]]
; CHECK-NEXT: [[TMP48:%.*]] = fadd fast <2 x double> [[TMP44]], [[TMP28]]
-; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
-; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
-; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
-; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
-; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
-; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
-; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
-; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
+; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
+; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
+; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
+; CHECK-NEXT: [[TMP60:%.*]] = fmul fast <2 x float> [[STRIDED_VEC56]], [[STRIDED_VEC56]]
+; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <2 x float> [[STRIDED_VEC68]], [[STRIDED_VEC68]]
+; CHECK-NEXT: [[TMP73:%.*]] = fmul fast <2 x float> [[STRIDED_VEC80]], [[STRIDED_VEC80]]
+; CHECK-NEXT: [[TMP74:%.*]] = fmul fast <2 x float> [[STRIDED_VEC86]], [[STRIDED_VEC86]]
+; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
+; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
+; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
+; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
+; CHECK-NEXT: [[TMP75:%.*]] = fmul fast <2 x float> [[STRIDED_VEC63]], [[STRIDED_VEC63]]
+; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <2 x float> [[STRIDED_VEC69]], [[STRIDED_VEC69]]
+; CHECK-NEXT: [[TMP70:%.*]] = fmul fast <2 x float> [[STRIDED_VEC81]], [[STRIDED_VEC81]]
+; CHECK-NEXT: [[TMP71:%.*]] = fmul fast <2 x float> [[STRIDED_VEC87]], [[STRIDED_VEC87]]
; CHECK-NEXT: [[TMP120:%.*]] = fadd fast <2 x float> [[TMP112]], [[TMP104]]
; CHECK-NEXT: [[TMP144:%.*]] = fadd fast <2 x float> [[TMP143]], [[TMP142]]
; CHECK-NEXT: [[TMP149:%.*]] = fadd fast <2 x float> [[TMP148]], [[TMP147]]
; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <2 x float> [[TMP153]], [[TMP152]]
+; CHECK-NEXT: [[TMP76:%.*]] = fadd fast <2 x float> [[TMP75]], [[TMP60]]
+; CHECK-NEXT: [[TMP77:%.*]] = fadd fast <2 x float> [[TMP81]], [[TMP67]]
+; CHECK-NEXT: [[TMP78:%.*]] = fadd fast <2 x float> [[TMP70]], [[TMP73]]
+; CHECK-NEXT: [[TMP79:%.*]] = fadd fast <2 x float> [[TMP71]], [[TMP74]]
; CHECK-NEXT: [[TMP61:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double>
; CHECK-NEXT: [[TMP62:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double>
; CHECK-NEXT: [[TMP63:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double>
; CHECK-NEXT: [[TMP155:%.*]] = fpext <2 x float> [[TMP154]] to <2 x double>
-; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP61]], [[TMP45]]
-; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP62]], [[TMP46]]
-; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP63]], [[TMP47]]
-; CHECK-NEXT: [[TMP67]] = fadd fast <2 x double> [[TMP155]], [[TMP48]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP84:%.*]] = fpext <2 x float> [[TMP76]] to <2 x double>
+; CHECK-NEXT: [[TMP85:%.*]] = fpext <2 x float> [[TMP77]] to <2 x double>
+; CHECK-NEXT: [[TMP86:%.*]] = fpext <2 x float> [[TMP78]] to <2 x double>
+; CHECK-NEXT: [[TMP87:%.*]] = fpext <2 x float> [[TMP79]] to <2 x double>
+; CHECK-NEXT: [[TMP88:%.*]] = fadd fast <2 x double> [[TMP61]], [[TMP55]]
+; CHECK-NEXT: [[TMP89:%.*]] = fadd fast <2 x double> [[TMP62]], [[TMP49]]
+; CHECK-NEXT: [[TMP90:%.*]] = fadd fast <2 x double> [[TMP63]], [[TMP50]]
+; CHECK-NEXT: [[TMP91:%.*]] = fadd fast <2 x double> [[TMP155]], [[TMP51]]
+; CHECK-NEXT: [[TMP92:%.*]] = fadd fast <2 x double> [[TMP84]], [[TMP45]]
+; CHECK-NEXT: [[TMP93:%.*]] = fadd fast <2 x double> [[TMP85]], [[TMP46]]
+; CHECK-NEXT: [[TMP94:%.*]] = fadd fast <2 x double> [[TMP86]], [[TMP47]]
+; CHECK-NEXT: [[TMP95:%.*]] = fadd fast <2 x double> [[TMP87]], [[TMP48]]
+; CHECK-NEXT: [[TMP96:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
+; CHECK-NEXT: [[TMP128:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
+; CHECK-NEXT: [[TMP129:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
+; CHECK-NEXT: [[TMP130:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
+; CHECK-NEXT: [[TMP136:%.*]] = fmul fast <2 x float> [[STRIDED_VEC64]], [[STRIDED_VEC64]]
+; CHECK-NEXT: [[TMP137:%.*]] = fmul fast <2 x float> [[STRIDED_VEC70]], [[STRIDED_VEC70]]
+; CHECK-NEXT: [[TMP139:%.*]] = fmul fast <2 x float> [[STRIDED_VEC82]], [[STRIDED_VEC82]]
+; CHECK-NEXT: [[TMP157:%.*]] = fmul fast <2 x float> [[STRIDED_VEC88]], [[STRIDED_VEC88]]
+; CHECK-NEXT: [[TMP159:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
+; CHECK-NEXT: [[TMP160:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
+; CHECK-NEXT: [[TMP161:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
+; CHECK-NEXT: [[TMP162:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT: [[TMP163:%.*]] = fmul fast <2 x float> [[STRIDED_VEC65]], [[STRIDED_VEC65]]
+; CHECK-NEXT: [[TMP164:%.*]] = fmul fast <2 x float> [[STRIDED_VEC71]], [[STRIDED_VEC71]]
+; CHECK-NEXT: [[TMP165:%.*]] = fmul fast <2 x float> [[STRIDED_VEC83]], [[STRIDED_VEC83]]
+; CHECK-NEXT: [[TMP166:%.*]] = fmul fast <2 x float> [[STRIDED_VEC89]], [[STRIDED_VEC89]]
+; CHECK-NEXT: [[TMP167:%.*]] = fadd fast <2 x float> [[TMP159]], [[TMP96]]
+; CHECK-NEXT: [[TMP168:%.*]] = fadd fast <2 x float> [[TMP160]], [[TMP128]]
+; CHECK-NEXT: [[TMP169:%.*]] = fadd fast <2 x float> [[TMP161]], [[TMP129]]
+; CHECK-NEXT: [[TMP170:%.*]] = fadd fast <2 x float> [[TMP162]], [[TMP130]]
+; CHECK-NEXT: [[TMP171:%.*]] = fadd fast <2 x float> [[TMP163]], [[TMP136]]
+; CHECK-NEXT: [[TMP172:%.*]] = fadd fast <2 x float> [[TMP164]], [[TMP137]]
+; CHECK-NEXT: [[TMP173:%.*]] = fadd fast <2 x float> [[TMP165]], [[TMP139]]
+; CHECK-NEXT: [[TMP174:%.*]] = fadd fast <2 x float> [[TMP166]], [[TMP157]]
+; CHECK-NEXT: [[TMP175:%.*]] = fpext <2 x float> [[TMP167]] to <2 x double>
+; CHECK-NEXT: [[TMP121:%.*]] = fpext <2 x float> [[TMP168]] to <2 x double>
+; CHECK-NEXT: [[TMP122:%.*]] = fpext <2 x float> [[TMP169]] to <2 x double>
+; CHECK-NEXT: [[TMP123:%.*]] = fpext <2 x float> [[TMP170]] to <2 x double>
+; CHECK-NEXT: [[TMP124:%.*]] = fpext <2 x float> [[TMP171]] to <2 x double>
+; CHECK-NEXT: [[TMP125:%.*]] = fpext <2 x float> [[TMP172]] to <2 x double>
+; CHECK-NEXT: [[TMP126:%.*]] = fpext <2 x float> [[TMP173]] to <2 x double>
+; CHECK-NEXT: [[TMP127:%.*]] = fpext <2 x float> [[TMP174]] to <2 x double>
+; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP175]], [[TMP88]]
+; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP121]], [[TMP89]]
+; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP122]], [[TMP90]]
+; CHECK-NEXT: [[TMP131]] = fadd fast <2 x double> [[TMP123]], [[TMP91]]
+; CHECK-NEXT: [[TMP132]] = fadd fast <2 x double> [[TMP124]], [[TMP92]]
+; CHECK-NEXT: [[TMP133]] = fadd fast <2 x double> [[TMP125]], [[TMP93]]
+; CHECK-NEXT: [[TMP134]] = fadd fast <2 x double> [[TMP126]], [[TMP94]]
+; CHECK-NEXT: [[TMP135]] = fadd fast <2 x double> [[TMP127]], [[TMP95]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP65]], [[TMP69]]
; CHECK-NEXT: [[BIN_RDX30:%.*]] = fadd fast <2 x double> [[TMP66]], [[BIN_RDX]]
-; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP67]], [[BIN_RDX30]]
+; CHECK-NEXT: [[BIN_RDX64:%.*]] = fadd fast <2 x double> [[TMP131]], [[BIN_RDX30]]
+; CHECK-NEXT: [[BIN_RDX65:%.*]] = fadd fast <2 x double> [[TMP132]], [[BIN_RDX64]]
+; CHECK-NEXT: [[BIN_RDX66:%.*]] = fadd fast <2 x double> [[TMP133]], [[BIN_RDX65]]
+; CHECK-NEXT: [[BIN_RDX67:%.*]] = fadd fast <2 x double> [[TMP134]], [[BIN_RDX66]]
+; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP135]], [[BIN_RDX67]]
; CHECK-NEXT: [[TMP158:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP156]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF69:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT: [[N_VEC70:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF69]]
+; CHECK-NEXT: [[TMP138:%.*]] = insertelement <2 x double> zeroinitializer, double [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT80:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI72:%.*]] = phi <2 x double> [ [[TMP138]], %[[VEC_EPILOG_PH]] ], [ [[TMP176:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
+; CHECK-NEXT: [[WIDE_VEC73:%.*]] = load <12 x float>, ptr [[ARRAYIDX5_REALP]], align 8
+; CHECK-NEXT: [[STRIDED_VEC74:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC75:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT: [[STRIDED_VEC76:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT: [[STRIDED_VEC77:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC78:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC79:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT: [[TMP140:%.*]] = fmul fast <2 x float> [[STRIDED_VEC74]], [[STRIDED_VEC74]]
+; CHECK-NEXT: [[TMP141:%.*]] = fmul fast <2 x float> [[STRIDED_VEC75]], [[STRIDED_VEC75]]
+; CHECK-NEXT: [[TMP177:%.*]] = fadd fast <2 x float> [[TMP141]], [[TMP140]]
+; CHECK-NEXT: [[TMP178:%.*]] = fpext <2 x float> [[TMP177]] to <2 x double>
+; CHECK-NEXT: [[TMP179:%.*]] = fadd fast <2 x double> [[TMP178]], [[VEC_PHI72]]
+; CHECK-NEXT: [[TMP145:%.*]] = fmul fast <2 x float> [[STRIDED_VEC76]], [[STRIDED_VEC76]]
+; CHECK-NEXT: [[TMP146:%.*]] = fmul fast <2 x float> [[STRIDED_VEC77]], [[STRIDED_VEC77]]
+; CHECK-NEXT: [[TMP180:%.*]] = fadd fast <2 x float> [[TMP146]], [[TMP145]]
+; CHECK-NEXT: [[TMP181:%.*]] = fpext <2 x float> [[TMP180]] to <2 x double>
+; CHECK-NEXT: [[TMP182:%.*]] = fadd fast <2 x double> [[TMP181]], [[TMP179]]
+; CHECK-NEXT: [[TMP150:%.*]] = fmul fast <2 x float> [[STRIDED_VEC78]], [[STRIDED_VEC78]]
+; CHECK-NEXT: [[TMP151:%.*]] = fmul fast <2 x float> [[STRIDED_VEC79]], [[STRIDED_VEC79]]
+; CHECK-NEXT: [[TMP183:%.*]] = fadd fast <2 x float> [[TMP151]], [[TMP150]]
+; CHECK-NEXT: [[TMP184:%.*]] = fpext <2 x float> [[TMP183]] to <2 x double>
+; CHECK-NEXT: [[TMP176]] = fadd fast <2 x double> [[TMP184]], [[TMP182]]
+; CHECK-NEXT: [[INDEX_NEXT80]] = add nuw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[TMP185:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]]
+; CHECK-NEXT: br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP186:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP176]])
+; CHECK-NEXT: [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]]
+; CHECK-NEXT: br i1 [[CMP_N81]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC70]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX82:%.*]] = phi double [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
; CHECK: [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
-; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 1
+; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX82]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX5_REALP1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP1]], align 8
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 1
; CHECK-NEXT: [[ARRAYIDX5_IMAG:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP]], align 8
; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[ARRAYIDX5_REAL]], [[ARRAYIDX5_REAL]]
; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[ARRAYIDX5_IMAG]], [[ARRAYIDX5_IMAG]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL9]], [[MUL]]
; CHECK-NEXT: [[CONV:%.*]] = fpext float [[ADD]] to double
; CHECK-NEXT: [[ADD10:%.*]] = fadd fast double [[CONV]], [[SUM_026]]
-; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 0
; CHECK-NEXT: [[ARRAYIDX5_REAL_1:%.*]] = load float, ptr [[ARRAYIDX5_REALP_1]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 1
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 1
; CHECK-NEXT: [[ARRAYIDX5_IMAG_1:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_1]], align 8
; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[ARRAYIDX5_REAL_1]], [[ARRAYIDX5_REAL_1]]
; CHECK-NEXT: [[MUL9_1:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_1]], [[ARRAYIDX5_IMAG_1]]
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL9_1]], [[MUL_1]]
; CHECK-NEXT: [[CONV_1:%.*]] = fpext float [[ADD_1]] to double
; CHECK-NEXT: [[ADD10_1:%.*]] = fadd fast double [[CONV_1]], [[ADD10]]
-; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 0
; CHECK-NEXT: [[ARRAYIDX5_REAL_2:%.*]] = load float, ptr [[ARRAYIDX5_REALP_2]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 1
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 1
; CHECK-NEXT: [[ARRAYIDX5_IMAG_2:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_2]], align 8
; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[ARRAYIDX5_REAL_2]], [[ARRAYIDX5_REAL_2]]
; CHECK-NEXT: [[MUL9_2:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_2]], [[ARRAYIDX5_IMAG_2]]
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL9_2]], [[MUL_2]]
; CHECK-NEXT: [[CONV_2:%.*]] = fpext float [[ADD_2]] to double
; CHECK-NEXT: [[ADD10_2]] = fadd fast double [[CONV_2]], [[ADD10_1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[FOR_COND_FOR_END13_CRIT_EDGE]]:
-; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float
; CHECK-NEXT: br label %[[FOR_END13]]
; CHECK: [[FOR_END13]]:
@@ -234,5 +385,6 @@ for.end13: ; preds = %for.cond.for.end13_
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index d2f8f2203b724..5d16ce5346bbf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -169,64 +169,140 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; AVX1: for.body.preheader:
+; AVX1: iter.check:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX1: vector.main.loop.iter.check:
+; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; AVX1: vector.ph:
-; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
-; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8
+; AVX1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 12
; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDEX]], 1
; AVX1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP3]], 1
+; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP8]], 1
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP7]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
+; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[WIDE_VEC9:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; AVX1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
+; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC12]] to <4 x i32>
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP7]]
; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2
; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[WIDE_VEC18:%.*]] = load <8 x i16>, ptr [[TMP17]], align 2
+; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[WIDE_VEC21:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
+; AVX1-NEXT: [[STRIDED_VEC24:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
+; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC24]] to <4 x i32>
; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
-; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
-; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]]
-; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]]
+; AVX1-NEXT: [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC11]] to <4 x i32>
+; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT: [[TMP32:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT: [[TMP33:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT: [[TMP34:%.*]] = sext <4 x i16> [[STRIDED_VEC23]] to <4 x i32>
+; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
+; AVX1-NEXT: [[TMP60:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP28]]
+; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP33]], [[TMP29]]
+; AVX1-NEXT: [[TMP68:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP30]]
+; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP44]]
+; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP60]], [[TMP45]]
+; AVX1-NEXT: [[TMP69:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP46]]
+; AVX1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[TMP68]], [[TMP47]]
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4
+; AVX1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 8
+; AVX1-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 12
; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4
-; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; AVX1-NEXT: store <4 x i32> [[TMP69]], ptr [[TMP71]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP70]], ptr [[TMP72]], align 4
+; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; AVX1: vec.epilog.iter.check:
+; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; AVX1: vec.epilog.ph:
+; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]]
; AVX1-NEXT: br label [[FOR_BODY1:%.*]]
+; AVX1: vec.epilog.vector.body:
+; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT: [[TMP48:%.*]] = shl nuw nsw i64 [[INDEX26]], 1
+; AVX1-NEXT: [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP48]]
+; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
+; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32>
+; AVX1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP48]]
+; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP51]], align 2
+; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT: [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32>
+; AVX1-NEXT: [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP52]], [[TMP50]]
+; AVX1-NEXT: [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32>
+; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32>
+; AVX1-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP54]]
+; AVX1-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP56]], [[TMP53]]
+; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDEX26]]
+; AVX1-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4
+; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4
+; AVX1-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]]
+; AVX1-NEXT: br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1: vec.epilog.middle.block:
+; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]]
+; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; AVX1: vec.epilog.scalar.ph:
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
-; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
@@ -248,7 +324,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
From 7c900b2ed295e832bdf5ba81debfab0f768a0aea Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 29 Aug 2025 11:38:46 +0100
Subject: [PATCH 2/3] Don't reset canonical IV start value
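
For reviewers, a rough standalone sketch of the resume-value scan this
change keeps (a hypothetical helper with assumed names, not the actual
in-tree code): the canonical IV's resume phi in the epilogue preheader
carries zero from the iteration-count checks and the main loop's vector
trip count otherwise, so the single non-zero incoming value is the one
to pick.

  // Hypothetical helper, not the in-tree function: pick out the main
  // vector loop's trip count among the incoming values of the epilogue
  // resume phi. Returns null when every incoming value is zero, which
  // means the vector trip count itself is zero.
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/PatternMatch.h"
  #include <cassert>

  static llvm::Value *findVectorTripCount(llvm::PHINode *EPResumeVal) {
    using namespace llvm::PatternMatch;
    llvm::Value *VectorTripCount = nullptr;
    for (llvm::Value *Inc : EPResumeVal->incoming_values()) {
      if (match(Inc, m_SpecificInt(0)))
        continue; // zero flows in from the iteration-count checks
      assert(!VectorTripCount && "expected a single non-zero incoming value");
      VectorTripCount = Inc;
    }
    return VectorTripCount;
  }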
---
.../Transforms/Vectorize/LoopVectorize.cpp | 166 +++++++++---------
.../Transforms/Vectorize/VPlanPatternMatch.h | 6 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 ++
3 files changed, 101 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1bfd70ee53c7c..d09e54dc4212c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9719,26 +9719,22 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- DenseMap<Value *, Value *> ToFrozen;
- // Ensure that the start values for all header phi recipes are updated before
- // vectorizing the epilogue loop.
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
- // When vectorizing the epilogue loop, the canonical induction start
- // value needs to be changed from zero to the value after the main
- // vector loop. Find the resume value created during execution of the main
- // VPlan. It must be the first phi in the loop preheader.
- // FIXME: Improve modeling for canonical IV start values in the epilogue
- // loop.
- using namespace llvm::PatternMatch;
- PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
- for (Value *Inc : EPResumeVal->incoming_values()) {
- if (match(Inc, m_SpecificInt(0)))
- continue;
- assert(!EPI.VectorTripCount &&
- "Must only have a single non-zero incoming value");
- EPI.VectorTripCount = Inc;
- }
+ auto *IV = cast<VPCanonicalIVPHIRecipe>(&*Header->begin());
+ // When vectorizing the epilogue loop, the canonical induction start
+ // value needs to be changed from zero to the value after the main
+ // vector loop. Find the resume value created during execution of the main
+ // VPlan. It must be the first phi in the loop preheader.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
+ // loop.
+ using namespace llvm::PatternMatch;
+ PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
+ for (Value *Inc : EPResumeVal->incoming_values()) {
+ if (match(Inc, m_SpecificInt(0)))
+ continue;
+ assert(!EPI.VectorTripCount &&
+ "Must only have a single non-zero incoming value");
+ EPI.VectorTripCount = Inc;
+ }
// If we didn't find a non-zero vector trip count, all incoming values
// must be zero, which also means the vector trip count is zero. Pick the
// first zero as vector trip count.
@@ -9763,70 +9759,76 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
}) &&
"the canonical IV should only be used by its increment or "
"ScalarIVSteps when resetting the start value");
- IV->setOperand(0, VPV);
- continue;
- }
-
- Value *ResumeV = nullptr;
- // TODO: Move setting of resume values to prepareToExecute.
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- auto *RdxResult =
- cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
- auto *VPI = dyn_cast<VPInstruction>(U);
- return VPI &&
- (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
- VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
- VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
- }));
- ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
- ->getIncomingValueForBlock(L->getLoopPreheader());
- RecurKind RK = ReductionPhi->getRecurrenceKind();
- if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
- Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
- // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
- // start value; compare the final value from the main vector loop
- // to the start value.
- BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
- IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
- ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
- } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
- Value *StartV = getStartValueFromReductionResult(RdxResult);
- ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
- EPI.MainLoopIterationCountCheck);
-
- // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
- // an adjustment to the resume value. The resume value is adjusted to
- // the sentinel value when the final value from the main vector loop
- // equals the start value. This ensures correctness when the start value
- // might not be less than the minimum value of a monotonically
- // increasing induction variable.
- BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
- IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
- Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
- Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
- ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
- } else {
- VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
- auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
- assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
- "unexpected start value");
- VPI->setOperand(0, StartVal);
- continue;
+ VPBuilder Builder(Header, Header->getFirstNonPhi());
+ VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
+ IV->replaceAllUsesWith(Add);
+ Add->setOperand(0, IV);
+
+ DenseMap<Value *, Value *> ToFrozen;
+ // Ensure that the start values for all header phi recipes are updated
+ // before vectorizing the epilogue loop.
+ for (VPRecipeBase &R : drop_begin(Header->phis())) {
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ auto *RdxResult = cast<VPInstruction>(
+ *find_if(ReductionPhi->users(), [](VPUser *U) {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ return VPI &&
+ (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
+ VPI->getOpcode() ==
+ VPInstruction::ComputeReductionResult ||
+ VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
+ }));
+ ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
+ ->getIncomingValueForBlock(L->getLoopPreheader());
+ RecurKind RK = ReductionPhi->getRecurrenceKind();
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
+ IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
+ ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
+ } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
+ Value *StartV = getStartValueFromReductionResult(RdxResult);
+ ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
+ EPI.MainLoopIterationCountCheck);
+
+ // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions
+ // requires an adjustment to the resume value. The resume value is
+ // adjusted to the sentinel value when the final value from the main
+ // vector loop equals the start value. This ensures correctness when
+ // the start value might not be less than the minimum value of a
+ // monotonically increasing induction variable.
+ BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
+ IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
+ Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
+ Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
+ ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
+ } else {
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
+ assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
+ "unexpected start value");
+ VPI->setOperand(0, StartVal);
+ continue;
+ }
+ }
+ } else {
+ // Retrieve the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop.
+ PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
+ // Hook up to the PHINode generated by a ResumePhi recipe of main
+ // loop VPlan, which feeds the scalar loop.
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
}
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
- } else {
- // Retrieve the induction resume values for wide inductions from
- // their original phi nodes in the scalar loop.
- PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
- // Hook up to the PHINode generated by a ResumePhi recipe of main
- // loop VPlan, which feeds the scalar loop.
- ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
- }
- assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
- cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
- }
// For some VPValues in the epilogue plan we must re-use the generated IR
// values from the main plan. Replace them with live-in VPValues.
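
The rewrite above no longer swaps the canonical IV's start operand to the main loop's resume value; the IV keeps counting from zero and a header add offsets every user by the resume value instead. A minimal standalone sketch (plain scalar loops with made-up resume and n, not the VPlan classes) of why the two formulations visit the same indices:

#include <cassert>

// Toy model: epilogue indices when the IV start is reset to the resume
// value versus when the IV stays at zero and an add offsets its users.
int main() {
  const long resume = 8, n = 13; // made-up resume value and trip count
  long a = 0;                    // old form: phi starts at resume
  for (long iv = resume; iv < n; ++iv)
    a += iv;                     // users read the IV directly
  long b = 0;                    // new form: phi starts at zero
  for (long iv = 0; iv < n - resume; ++iv)
    b += iv + resume;            // users read "iv + resume" instead
  assert(a == b);                // both formulations visit 8..12
  return 0;
}
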
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 18c2bef90f08f..fd5c3e4651beb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -383,6 +383,12 @@ m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Instruction::Add, Op0_t, Op1_t> m_Add(const Op0_t &Op0,
+ const Op1_t &Op1) {
+ return m_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1);
+}
+
template <typename Op0_t, typename Op1_t>
inline AllRecipe_commutative_match<Instruction::Add, Op0_t, Op1_t>
m_c_Add(const Op0_t &Op0, const Op1_t &Op1) {
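
The new m_Add matcher complements the existing m_c_Add: it binds the operands strictly in the order written, without trying the commuted form. A self-contained toy illustration of the difference (simplified structs, not the real matcher templates):

#include <cassert>

// Toy order-sensitive and commutative add matchers, standing in for
// m_Add and m_c_Add over a simplified expression node.
struct Expr { int Opcode; const void *LHS, *RHS; };
constexpr int AddOpc = 13; // arbitrary stand-in for Instruction::Add

static bool matchAdd(const Expr &E, const void *A, const void *B) {
  return E.Opcode == AddOpc && E.LHS == A && E.RHS == B;
}
static bool matchAddCommutative(const Expr &E, const void *A, const void *B) {
  return matchAdd(E, A, B) || matchAdd(E, B, A); // also try swapped operands
}

int main() {
  int X = 0, Y = 0;
  Expr E{AddOpc, &X, &Y};                 // models "add X, Y"
  assert(matchAdd(E, &X, &Y));            // m_Add: exact operand order
  assert(!matchAdd(E, &Y, &X));           // m_Add rejects the swapped form
  assert(matchAddCommutative(E, &Y, &X)); // m_c_Add accepts either order
  return 0;
}
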
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d376557553c0..4e00a39fd0858 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1207,6 +1207,17 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (!Plan->isUnrolled())
return;
+ if (match(Def, m_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
+ isa<VPPhi>(X)) {
+ auto *Phi = cast<VPPhi>(X);
+ if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
+ Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
+ Phi->setOperand(0, Y);
+ Def->replaceAllUsesWith(Phi);
+ return;
+ }
+ }
+
// VPVectorPointer for part 0 can be replaced by their start pointer.
if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
if (VecPtr->isFirstPart()) {
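
This simplification folds the loop-invariant operand of the offset add introduced above into the phi's start value, then replaces the add with the phi itself. A toy model of the identity (hypothetical Y and Step; the real precondition that the add is the phi's only user is elided here):

#include <cassert>

// Toy model of the fold: an IV of the form "phi(Start, phi + Step)";
// folding a loop-invariant offset Y into Start lets users read the phi
// directly instead of "phi + Y".
struct IV {
  long Start, Step;
  long at(long K) const { return Start + K * Step; } // value at iteration K
};

int main() {
  const long Y = 8, Step = 4; // hypothetical resume value and VF * UF
  IV Before{0, Step};         // users read Before.at(K) + Y
  IV After{Y, Step};          // after the fold, users read After.at(K)
  for (long K = 0; K < 5; ++K)
    assert(Before.at(K) + Y == After.at(K)); // identical observed stream
  return 0;
}
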
>From bc336d246b152fe48499f3dca2b4bfe84b6450f8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 5 Jun 2025 22:17:46 +0100
Subject: [PATCH 3/3] [VPlan] Make canonical IV part of the region
---
.../Transforms/Vectorize/LoopVectorize.cpp | 210 +++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 82 +++++--
llvm/lib/Transforms/Vectorize/VPlan.h | 156 +++++--------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 43 ++--
.../Vectorize/VPlanConstruction.cpp | 28 +--
.../Transforms/Vectorize/VPlanPatternMatch.h | 1 -
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 96 ++++----
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 25 ++-
.../Transforms/Vectorize/VPlanVerifier.cpp | 6 -
.../AArch64/conditional-branches-cost.ll | 14 +-
.../AArch64/divs-with-scalable-vfs.ll | 4 +-
.../AArch64/fully-unrolled-cost.ll | 4 +-
.../AArch64/induction-costs-sve.ll | 8 +-
.../AArch64/masked-call-scalarize.ll | 4 +-
.../LoopVectorize/AArch64/masked-call.ll | 2 +-
.../LoopVectorize/AArch64/optsize_minsize.ll | 8 +-
.../AArch64/partial-reduce-dot-product.ll | 6 +-
.../AArch64/reduction-recurrence-costs-sve.ll | 4 +-
.../AArch64/scalable-strict-fadd.ll | 14 +-
.../LoopVectorize/AArch64/store-costs-sve.ll | 2 +-
.../sve-interleaved-masked-accesses.ll | 8 +-
.../AArch64/sve-tail-folding-forced.ll | 2 +-
.../sve-tail-folding-overflow-checks.ll | 2 +-
.../AArch64/sve-tail-folding-reductions.ll | 12 +-
.../AArch64/sve-tail-folding-unroll.ll | 4 +-
.../LoopVectorize/AArch64/sve-tail-folding.ll | 20 +-
.../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +-
.../AArch64/tail-fold-uniform-memops.ll | 4 +-
.../AArch64/tail-folding-styles.ll | 2 +-
.../AArch64/uniform-args-call-variants.ll | 10 +-
.../LoopVectorize/ARM/mve-icmpcost.ll | 15 +-
.../LoopVectorize/ARM/optsize_minsize.ll | 2 +-
.../X86/CostModel/vpinstruction-cost.ll | 2 -
.../LoopVectorize/iv-select-cmp-decreasing.ll | 4 +-
...eref-pred-poison-ub-ops-feeding-pointer.ll | 10 +-
.../LoopVectorize/vplan-printing.ll | 68 +++---
.../Transforms/Vectorize/VPlanHCFGTest.cpp | 5 +-
.../Vectorize/VPlanPatternMatchTest.cpp | 29 ++-
.../Vectorize/VPlanVerifierTest.cpp | 27 +--
42 files changed, 486 insertions(+), 483 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d09e54dc4212c..c4e24b3b19942 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4064,7 +4064,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
case VPDef::VPInstructionSC:
- case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
case VPDef::VPVectorEndPointerSC:
case VPDef::VPExpandSCEVSC:
@@ -8594,6 +8593,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
"Did not find the canonical IV increment");
cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
+ Plan->getCanonicalIVInfo().HasNUW = false;
}
// ---------------------------------------------------------------------------
@@ -8657,8 +8657,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// latter are added above for masking.
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
- if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
- &R) ||
+ if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe>(&R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
@@ -8879,8 +8878,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
Builder, BlockMaskCache, nullptr /*LVer*/);
for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
}
@@ -9632,8 +9629,6 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
for (VPRecipeBase &R :
EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
EpiWidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
}
@@ -9685,7 +9680,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
VPValue *VectorTC = &MainPlan.getVectorTripCount();
// If there is a suitable resume value for the canonical induction in the
// scalar (which will become vector) epilogue loop, use it and move it to the
- // beginning of the scalar preheader. Otherwise create it below.
+  // beginning of the scalar preheader. Otherwise create it below.
auto ResumePhiIter =
find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
@@ -9694,8 +9689,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
VPPhi *ResumePhi = nullptr;
if (ResumePhiIter == MainScalarPH->phis().end()) {
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
+ Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(VectorTC);
ResumePhi = ScalarPHBuilder.createScalarPhi(
- {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+ {VectorTC, MainPlan.getOrAddLiveIn(Constant::getNullValue(Ty))}, {},
"vec.epilog.resume.val");
} else {
ResumePhi = cast<VPPhi>(&*ResumePhiIter);
@@ -9719,13 +9715,14 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- auto *IV = cast<VPCanonicalIVPHIRecipe>(&*Header->begin());
- // When vectorizing the epilogue loop, the canonical induction start
- // value needs to be changed from zero to the value after the main
- // vector loop. Find the resume value created during execution of the main
- // VPlan. It must be the first phi in the loop preheader.
- // FIXME: Improve modeling for canonical IV start values in the epilogue
- // loop.
+ DenseMap<Value *, Value *> ToFrozen;
+ // Ensure that the start values for all header phi recipes are updated before
+ // vectorizing the epilogue loop.
+ auto *IV = Plan.getCanonicalIV();
+ // When vectorizing the epilogue loop, the canonical induction start value
+ // needs to be changed from zero to the value after the main vector loop. Find
+ // the resume value created during execution of the main VPlan.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue loop.
using namespace llvm::PatternMatch;
PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
for (Value *Inc : EPResumeVal->incoming_values()) {
@@ -9735,100 +9732,97 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
"Must only have a single non-zero incoming value");
EPI.VectorTripCount = Inc;
}
- // If we didn't find a non-zero vector trip count, all incoming values
- // must be zero, which also means the vector trip count is zero. Pick the
- // first zero as vector trip count.
- // TODO: We should not choose VF * UF so the main vector loop is known to
- // be dead.
- if (!EPI.VectorTripCount) {
- assert(
- EPResumeVal->getNumIncomingValues() > 0 &&
- all_of(EPResumeVal->incoming_values(),
- [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
- "all incoming values must be 0");
- EPI.VectorTripCount = EPResumeVal->getOperand(0);
- }
- VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
- assert(all_of(IV->users(),
- [](const VPUser *U) {
- return isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U) ||
- cast<VPRecipeBase>(U)->isScalarCast() ||
- cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
- }) &&
- "the canonical IV should only be used by its increment or "
- "ScalarIVSteps when resetting the start value");
- VPBuilder Builder(Header, Header->getFirstNonPhi());
- VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
- IV->replaceAllUsesWith(Add);
- Add->setOperand(0, IV);
-
- DenseMap<Value *, Value *> ToFrozen;
- // Ensure that the start values for all header phi recipes are updated
- // before vectorizing the epilogue loop.
- for (VPRecipeBase &R : drop_begin(Header->phis())) {
- Value *ResumeV = nullptr;
- // TODO: Move setting of resume values to prepareToExecute.
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- auto *RdxResult = cast<VPInstruction>(
- *find_if(ReductionPhi->users(), [](VPUser *U) {
- auto *VPI = dyn_cast<VPInstruction>(U);
- return VPI &&
- (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
- VPI->getOpcode() ==
- VPInstruction::ComputeReductionResult ||
- VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
- }));
- ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
- ->getIncomingValueForBlock(L->getLoopPreheader());
- RecurKind RK = ReductionPhi->getRecurrenceKind();
- if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
- Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
- // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
- // start value; compare the final value from the main vector loop
- // to the start value.
- BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
- IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
- ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
- } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
- Value *StartV = getStartValueFromReductionResult(RdxResult);
- ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
- EPI.MainLoopIterationCountCheck);
-
- // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions
- // requires an adjustment to the resume value. The resume value is
- // adjusted to the sentinel value when the final value from the main
- // vector loop equals the start value. This ensures correctness when
- // the start value might not be less than the minimum value of a
- // monotonically increasing induction variable.
- BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
- IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
- Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
- Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
- ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
- } else {
- VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
- auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
- assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
- "unexpected start value");
- VPI->setOperand(0, StartVal);
- continue;
- }
- }
- } else {
- // Retrieve the induction resume values for wide inductions from
- // their original phi nodes in the scalar loop.
- PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
- // Hook up to the PHINode generated by a ResumePhi recipe of main
- // loop VPlan, which feeds the scalar loop.
- ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
- }
- assert(ResumeV && "Must have a resume value");
+ // If we didn't find a non-zero vector trip count, all incoming values
+ // must be zero, which also means the vector trip count is zero. Pick the
+ // first zero as vector trip count.
+ // TODO: We should not choose VF * UF so the main vector loop is known to
+ // be dead.
+ if (!EPI.VectorTripCount) {
+ assert(EPResumeVal->getNumIncomingValues() > 0 &&
+ all_of(EPResumeVal->incoming_values(),
+ [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
+ "all incoming values must be 0");
+ EPI.VectorTripCount = EPResumeVal->getOperand(0);
+ }
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPDerivedIVRecipe>(U) ||
+ cast<VPRecipeBase>(U)->isScalarCast() ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add;
+ }) &&
+ "the canonical IV should only be used by its increment or "
+ "ScalarIVSteps when resetting the start value");
+ VPBuilder Builder(Header, Header->getFirstNonPhi());
+ VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
+ IV->replaceAllUsesWith(Add);
+ Add->setOperand(0, IV);
+
+ // Ensure that the start values for all header phi recipes are updated before
+ // vectorizing the epilogue loop.
+ for (VPRecipeBase &R : Header->phis()) {
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ auto *RdxResult =
+ cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ return VPI &&
+ (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
+ VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
+ VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
+ }));
+ ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
+ ->getIncomingValueForBlock(L->getLoopPreheader());
+ RecurKind RK = ReductionPhi->getRecurrenceKind();
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
+ // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+ // start value; compare the final value from the main vector loop
+ // to the start value.
+ BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
+ IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
+ ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
+ } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
+ Value *StartV = getStartValueFromReductionResult(RdxResult);
+ ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
+ EPI.MainLoopIterationCountCheck);
+
+ // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions
+ // requires an adjustment to the resume value. The resume value is
+ // adjusted to the sentinel value when the final value from the main
+ // vector loop equals the start value. This ensures correctness when
+ // the start value might not be less than the minimum value of a
+ // monotonically increasing induction variable.
+ BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
+ IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
+ Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
+ Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
+ ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
+ } else {
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
- cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
+ assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
+ "unexpected start value");
+ VPI->setOperand(0, StartVal);
+ continue;
+ }
}
+ } else {
+ // Retrieve the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop.
+ PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
+ // Hook up to the PHINode generated by a ResumePhi recipe of main
+ // loop VPlan, which feeds the scalar loop.
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
+ }
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
+ }
// For some VPValues in the epilogue plan we must re-use the generated IR
// values from the main plan. Replace them with live-in VPValues.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1438dc366b55d..e3c42029ca6c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -771,10 +771,16 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
VPRegionBlock *VPRegionBlock::clone() {
const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
- auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
- getName(), isReplicator());
+ auto *NewRegion =
+ getPlan()->createVPRegionBlock(NewEntry, NewExiting, getName());
for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
Block->setParent(NewRegion);
+
+ if (CanIV.CanIV) {
+ NewRegion->CanIV.CanIV = new VPRegionValue();
+ NewRegion->CanIV.HasNUW = CanIV.HasNUW;
+ NewRegion->CanIV.DL = CanIV.DL;
+ }
return NewRegion;
}
@@ -868,6 +874,11 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
auto NewIndent = Indent + " ";
+ if (CanIV.CanIV) {
+ O << '\n';
+ CanIV.CanIV->print(O, SlotTracker);
+ O << '\n';
+ }
for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
O << '\n';
BlockBase->print(O, NewIndent, SlotTracker);
@@ -880,18 +891,48 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
void VPRegionBlock::dissolveToCFGLoop() {
auto *Header = cast<VPBasicBlock>(getEntry());
- if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(&Header->front())) {
- assert(this == getPlan()->getVectorLoopRegion() &&
- "Canonical IV must be in the entry of the top-level loop region");
- auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
- {CanIV->getStartValue(), CanIV->getBackedgeValue()},
- CanIV->getDebugLoc(), "index");
- CanIV->replaceAllUsesWith(ScalarR);
- CanIV->eraseFromParent();
+ auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
+ if (CanIV.CanIV && CanIV.CanIV->getNumUsers() > 0) {
+ VPlan &Plan = *getPlan();
+ auto *ExitingTerm = ExitingLatch->getTerminator();
+ VPValue *IncCand;
+ VPInstruction *CanIVInc = nullptr;
+ if (match(ExitingTerm, m_BranchOnCount(m_VPValue(IncCand), m_VPValue()))) {
+ assert(!CanIVInc && "Expecting at most one canonical IV increment");
+ CanIVInc = cast<VPInstruction>(IncCand);
+ }
+ if (!CanIVInc) {
+ for (VPUser *U : CanIV.CanIV->users()) {
+ VPValue *Inc;
+ if (!match(U, m_VPInstruction<Instruction::Add>(m_Specific(CanIV.CanIV),
+ m_VPValue(Inc))))
+ continue;
+ auto *IncCand = cast<VPInstruction>(U);
+ if (Inc != &Plan.getVFxUF() &&
+ (IncCand->getNumUsers() != 1 ||
+ !match(*IncCand->user_begin(),
+ m_BranchOnCount(m_Specific(IncCand), m_VPValue()))))
+ continue;
+ assert(!CanIVInc && "Expecting at most one canonical IV increment");
+ CanIVInc = cast<VPInstruction>(U);
+ }
+ }
+ if (!CanIVInc) {
+ CanIVInc = VPBuilder(ExitingTerm)
+ .createOverflowingOp(
+ Instruction::Add, {CanIV.CanIV, &Plan.getVFxUF()},
+ {CanIV.HasNUW, false}, CanIV.DL, "index.next");
+ }
+ Type *CanIVTy = VPTypeAnalysis(Plan).inferScalarType(CanIV.CanIV);
+ auto *ScalarR =
+ VPBuilder(Header, Header->begin())
+ .createScalarPhi(
+ {Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 0)), CanIVInc},
+ CanIV.DL, "index");
+ CanIV.CanIV->replaceAllUsesWith(ScalarR);
}
VPBlockBase *Preheader = getSinglePredecessor();
- auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
VPBlockBase *Middle = getSingleSuccessor();
VPBlockUtils::disconnectBlocks(Preheader, this);
VPBlockUtils::disconnectBlocks(this, Middle);
@@ -928,7 +969,10 @@ VPlan::~VPlan() {
for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
R.setOperand(I, &DummyValue);
}
+ } else if (auto *CanIV = cast<VPRegionBlock>(VPB)->getCanonicalIV()) {
+ CanIV->replaceAllUsesWith(&DummyValue);
}
+
delete VPB;
}
for (VPValue *VPV : getLiveIns())
@@ -1212,6 +1256,11 @@ VPlan *VPlan::duplicate() {
// else NewTripCount will be created and inserted into Old2NewVPValues when
// TripCount is cloned. In any case NewPlan->TripCount is updated below.
+ if (auto *LoopRegion = getVectorLoopRegion()) {
+ Old2NewVPValues[LoopRegion->getCanonicalIV()] =
+ NewPlan->getVectorLoopRegion()->getCanonicalIV();
+ }
+
remapOperands(Entry, NewEntry, Old2NewVPValues);
// Initialize remaining fields of cloned VPlan.
@@ -1392,6 +1441,8 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
/// Returns true if there is a vector loop region and \p VPV is defined in a
/// loop region.
static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
+ if (isa<VPRegionValue>(VPV))
+ return true;
const VPRecipeBase *DefR = VPV->getDefiningRecipe();
return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
DefR->getParent()->getEnclosingLoopRegion());
@@ -1501,9 +1552,12 @@ void VPSlotTracker::assignNames(const VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
- for (const VPBasicBlock *VPBB :
- VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
- assignNames(VPBB);
+ for (const VPBlockBase *VPB : RPOT) {
+ if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+ assignNames(VPBB);
+ } else if (auto *CanIV = cast<VPRegionBlock>(VPB)->getCanonicalIV())
+ assignName(CanIV);
+ }
}
void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
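
With the canonical IV owned by the region, dissolveToCFGLoop has to materialize it as an ordinary scalar phi, preferring an existing add that already feeds the latch branch-on-count over creating a fresh increment. A small standalone sketch of that find-or-create step (toy strings standing in for recipes):

#include <cstdio>
#include <optional>
#include <string>

// Toy find-or-create for the IV increment when dissolving a loop region:
// reuse an increment that already feeds branch-on-count, otherwise
// synthesize one, then build the scalar header phi from it.
static std::optional<std::string>
findExistingIncrement(bool BranchOnCountUsesAdd) {
  if (BranchOnCountUsesAdd)
    return std::string("index.next"); // reuse avoids a duplicate add
  return std::nullopt;                // caller must create the increment
}

int main() {
  std::string Inc = findExistingIncrement(/*BranchOnCountUsesAdd=*/true)
                        .value_or("index.next (freshly created add iv, vfxuf)");
  std::printf("index = phi [0, preheader], [%s, latch]\n", Inc.c_str());
  return 0;
}
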
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 055db2b9adb95..fae8c8b87d074 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -547,7 +547,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
- case VPRecipeBase::VPCanonicalIVPHISC:
case VPRecipeBase::VPActiveLaneMaskPHISC:
case VPRecipeBase::VPFirstOrderRecurrencePHISC:
case VPRecipeBase::VPWidenPHISC:
@@ -1930,12 +1929,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
/// the backedge is the second operand.
///
/// Inductions are modeled using the following sub-classes:
-/// * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
-/// starting at a specified value (zero for the main vector loop, the resume
-/// value for the epilogue vector loop) and stepping by 1. The induction
-/// controls exiting of the vector loop by comparing against the vector trip
-/// count. Produces a single scalar PHI for the induction value per
-/// iteration.
/// * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
/// floating point inductions with arbitrary start and step values. Produces
/// a vector PHI per-part.
@@ -3288,63 +3281,6 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *getSCEV() const { return Expr; }
};
-/// Canonical scalar induction phi of the vector loop. Starting at the specified
-/// start value (either 0 or the resume value when vectorizing the epilogue
-/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
-/// canonical induction variable.
-class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
-public:
- VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
- : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
-
- ~VPCanonicalIVPHIRecipe() override = default;
-
- VPCanonicalIVPHIRecipe *clone() override {
- auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
- R->addOperand(getBackedgeValue());
- return R;
- }
-
- VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
-
- void execute(VPTransformState &State) override {
- llvm_unreachable("cannot execute this recipe, should be replaced by a "
- "scalar phi recipe");
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// Returns the scalar type of the induction.
- Type *getScalarType() const {
- return getStartValue()->getLiveInIRValue()->getType();
- }
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return true;
- }
-
- /// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return true;
- }
-
- /// Return the cost of this VPCanonicalIVPHIRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override {
- // For now, match the behavior of the legacy cost model.
- return 0;
- }
-};
-
/// A recipe for generating the active lane mask for the vector loop that is
/// used to predicate the vector operations.
/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
@@ -3423,14 +3359,13 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
public VPUnrollPartAccessor<1> {
public:
- VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
+ VPWidenCanonicalIVRecipe(VPValue *CanonicalIV)
: VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
~VPWidenCanonicalIVRecipe() override = default;
VPWidenCanonicalIVRecipe *clone() override {
- return new VPWidenCanonicalIVRecipe(
- cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
+ return new VPWidenCanonicalIVRecipe(getOperand(0));
}
VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
@@ -3469,8 +3404,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
public:
VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
- const Twine &Name = "")
+ VPValue *CanonicalIV, VPValue *Step, const Twine &Name = "")
: VPDerivedIVRecipe(
IndDesc.getKind(),
dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
@@ -3816,6 +3750,17 @@ class VPIRBasicBlock : public VPBasicBlock {
BasicBlock *getIRBasicBlock() const { return IRBB; }
};
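+/// Bookkeeping for a loop region's canonical IV: the IV value itself, whether
+/// its increment may use a no-unsigned-wrap flag, and the debug location for
+/// recipes materialized from it.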
+struct VPCanonicalIVS {
+ VPValue *CanIV = nullptr;
+ bool HasNUW = true;
+ DebugLoc DL = DebugLoc::getUnknown();
+
+ VPCanonicalIVS(VPValue *CanIV, bool HasNUW, DebugLoc DL)
+ : CanIV(CanIV), HasNUW(HasNUW), DL(DL) {}
+
+  VPCanonicalIVS() = default;
+};
+
/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
/// A VPRegionBlock may indicate that its contents are to be replicated several
@@ -3834,26 +3779,40 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
/// VPRegionBlock.
VPBlockBase *Exiting;
- /// An indicator whether this region is to generate multiple replicated
- /// instances of output IR corresponding to its VPBlockBases.
- bool IsReplicator;
+ /// Canonical IV of the loop region. If nullptr, the region is a replication
+ /// region.
+ VPCanonicalIVS CanIV;
/// Use VPlan::createVPRegionBlock to create VPRegionBlocks.
VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
- const std::string &Name = "", bool IsReplicator = false)
+ const std::string &Name = "")
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
+ CanIV() {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exiting->setParent(this);
+ }
+
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, VPCanonicalIVS CanIV,
+ const std::string &Name = "")
: VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
- IsReplicator(IsReplicator) {
+ CanIV(CanIV) {
assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
Entry->setParent(this);
Exiting->setParent(this);
}
- VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+
+ VPRegionBlock(DebugLoc DL, const std::string &Name = "")
: VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
- IsReplicator(IsReplicator) {}
+ CanIV(new VPRegionValue(), true, DL) {}
public:
- ~VPRegionBlock() override {}
+ ~VPRegionBlock() override {
+ if (getCanonicalIV())
+ delete CanIV.CanIV;
+ }
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPBlockBase *V) {
@@ -3892,7 +3851,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
- bool isReplicator() const { return IsReplicator; }
+ bool isReplicator() const { return !getCanonicalIV(); }
/// The method which generates the output IR instructions that correspond to
/// this VPRegionBlock, thereby "executing" the VPlan.
@@ -3920,6 +3879,13 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
/// Remove the current region from its VPlan, connecting its predecessor to
/// its entry, and its exiting block to its successor.
void dissolveToCFGLoop();
+
+ /// Return the canonical induction variable of the region, null for
+ /// replicating regions.
+ VPValue *getCanonicalIV() { return CanIV.CanIV; }
+ const VPValue *getCanonicalIV() const { return CanIV.CanIV; }
+
+ VPCanonicalIVS &getCanonicalIVInfo() { return CanIV; }
};
/// VPlan models a candidate for vectorization, encoding various decisions take
@@ -4231,14 +4197,10 @@ class VPlan {
LLVM_DUMP_METHOD void dump() const;
#endif
- /// Returns the canonical induction recipe of the vector loop.
- VPCanonicalIVPHIRecipe *getCanonicalIV() {
- VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
- if (EntryVPBB->empty()) {
- // VPlan native path.
- EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
- }
- return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+ /// Returns the canonical induction VPValue of the vector loop.
+ VPValue *getCanonicalIV() { return getVectorLoopRegion()->getCanonicalIV(); }
+ VPCanonicalIVS &getCanonicalIVInfo() {
+ return getVectorLoopRegion()->getCanonicalIVInfo();
}
VPValue *getSCEVExpansion(const SCEV *S) const {
@@ -4264,22 +4226,22 @@ class VPlan {
return VPB;
}
- /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
- /// IsReplicator is true, the region is a replicate region. The returned block
- /// is owned by the VPlan and deleted once the VPlan is destroyed.
+ /// Create a new replicate VPRegionBlock with \p Entry, \p Exiting and \p
+ /// Name. The returned block is owned by the VPlan and deleted once the VPlan
+ /// is destroyed.
VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
- const std::string &Name = "",
- bool IsReplicator = false) {
- auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+ const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(Entry, Exiting, Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
- /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set
- /// to nullptr. The returned block is owned by the VPlan and deleted once the
- /// VPlan is destroyed.
- VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
- auto *VPB = new VPRegionBlock(Name);
+  /// Create a new loop VPRegionBlock with \p DL and \p Name, and entry and
+ /// exiting blocks set to nullptr. The returned block is owned by the VPlan
+ /// and deleted once the VPlan is destroyed.
+ VPRegionBlock *createVPRegionBlock(DebugLoc DL,
+ const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(DL, Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
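
The header changes above replace the stored IsReplicator flag with an invariant: loop regions own a canonical IV value, replicate regions do not, and isReplicator() is derived from that. A minimal standalone model of the invariant (toy types, not the VPlan classes):

#include <cassert>
#include <memory>

// Toy model of the new invariant: a loop region owns a canonical IV
// value; a replicate region has none, so "is replicator" is derived
// rather than stored as a separate flag.
struct Value {};
struct Region {
  std::unique_ptr<Value> CanIV; // set for loop regions only
  bool isReplicator() const { return CanIV == nullptr; }
};

int main() {
  Region Replicate;                       // no IV: a replicate region
  Region Loop{std::make_unique<Value>()}; // loop region owns its IV
  assert(Replicate.isReplicator());
  assert(!Loop.isReplicator());
  return 0;
}
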
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8e5997fa8befe..73b89e3251557 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -23,14 +23,6 @@ using namespace llvm;
#define DEBUG_TYPE "vplan"
VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) {
- if (auto LoopRegion = Plan.getVectorLoopRegion()) {
- if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
- &LoopRegion->getEntryBasicBlock()->front())) {
- CanonicalIVTy = CanIV->getScalarType();
- return;
- }
- }
-
// If there's no canonical IV, retrieve the type from the trip count
// expression.
auto *TC = Plan.getTripCount();
@@ -270,18 +262,20 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return CanonicalIVTy;
}
+  if (isa<VPRegionValue>(V))
+ return CanonicalIVTy;
+
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
- .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
- VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
- VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ .Case<VPActiveLaneMaskPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+ VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+            // which needs special handling due to it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
@@ -460,12 +454,13 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
// FIXME: Might need some motivation why these values are ignored. If
// for example an argument is used inside the loop it will increase the
// register pressure (so shouldn't we add it to LoopInvariants).
- if (!DefR && (!U->getLiveInIRValue() ||
- !isa<Instruction>(U->getLiveInIRValue())))
+ if (!isa<VPRegionValue>(U) && !DefR &&
+ (!U->getLiveInIRValue() ||
+ !isa<Instruction>(U->getLiveInIRValue())))
continue;
// If this recipe is outside the loop then record it and continue.
- if (!DefR) {
+ if (!DefR && !isa<VPRegionValue>(U)) {
LoopInvariants.insert(U);
continue;
}
@@ -513,6 +508,10 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
};
+ if (auto *CanIV = LoopRegion->getCanonicalIV())
+ if (CanIV->getNumUsers() != 0)
+ OpenIntervals.insert(CanIV);
+
// We scan the instructions linearly and record each time that a new interval
// starts, by placing it in a set. If we find this value in TransposEnds then
// we remove it from the set. The max register usage is the maximum register
@@ -558,7 +557,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
continue;
if (VFs[J].isScalar() ||
- isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
+ isa<VPRegionValue, VPReplicateRecipe, VPDerivedIVRecipe,
VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(VPV) ||
(isa<VPInstruction>(VPV) && vputils::onlyScalarValuesUsed(VPV)) ||
(isa<VPReductionPHIRecipe>(VPV) &&
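
Because the canonical IV is now a region value with no defining recipe, the register-usage scan seeds it as an already-open interval instead of discovering it at a definition point. A self-contained toy version of that interval scan (made-up value indices and end points):

#include <algorithm>
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

// Toy live-interval scan: recipe-defined values open an interval at their
// definition; a region-owned value like the canonical IV has no defining
// recipe, so it is seeded as live before the scan starts.
int main() {
  std::set<int> Open = {0};                    // 0 models the canonical IV
  std::vector<std::pair<int, int>> Recipes = { // {defined value, last use}
      {1, 3}, {2, 2}};
  size_t MaxLive = Open.size();
  for (int Idx = 1; Idx <= 3; ++Idx) {
    for (auto &[V, End] : Recipes)
      if (V == Idx)
        Open.insert(V); // interval opens where the value is defined
    MaxLive = std::max(MaxLive, Open.size());
    for (auto &[V, End] : Recipes)
      if (End == Idx)
        Open.erase(V); // interval closes after the last use
  }
  std::printf("max simultaneously live values: %zu\n", MaxLive);
  return 0;
}
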
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index c1c525ee23122..6be031b0d39a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -392,7 +392,7 @@ static bool canonicalHeaderAndLatch(VPBlockBase *HeaderVPB,
/// Create a new VPRegionBlock for the loop starting at \p HeaderVPB.
static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0];
- auto *LatchVPBB = HeaderVPB->getPredecessors()[1];
+ auto *LatchVPBB = cast<VPBasicBlock>(HeaderVPB->getPredecessors()[1]);
VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPB);
VPBlockUtils::disconnectBlocks(LatchVPBB, HeaderVPB);
@@ -403,13 +403,22 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
// LatchExitVPB, taking care to preserve the original predecessor & successor
// order of blocks. Set region entry and exiting after both HeaderVPB and
// LatchVPBB have been disconnected from their predecessors/successors.
- auto *R = Plan.createVPRegionBlock();
+ VPPhi *ScalarCanIV = nullptr;
+ if (PreheaderVPBB->getSinglePredecessor() == Plan.getEntry())
+ ScalarCanIV = cast<VPPhi>(&*cast<VPBasicBlock>(HeaderVPB)->begin());
+ auto *R =
+ Plan.createVPRegionBlock(ScalarCanIV ? ScalarCanIV->getDebugLoc()
+ : DebugLoc::getCompilerGenerated());
VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
VPBlockUtils::disconnectBlocks(LatchVPBB, R);
VPBlockUtils::connectBlocks(PreheaderVPBB, R);
R->setEntry(HeaderVPB);
R->setExiting(LatchVPBB);
+ if (ScalarCanIV) {
+ ScalarCanIV->replaceAllUsesWith(R->getCanonicalIV());
+ ScalarCanIV->eraseFromParent();
+ }
// All VPBB's reachable shallowly from HeaderVPB belong to the current region.
for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPB))
VPBB->setParent(R);
@@ -422,9 +431,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddLiveIn(StartIdx);
-
- // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
- auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
+ auto *CanonicalIVPHI = new VPPhi(StartV, DL);
HeaderVPBB->insert(CanonicalIVPHI, HeaderVPBB->begin());
// We are about to replace the branch to exit the region. Remove the original
@@ -437,14 +444,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
}
VPBuilder Builder(LatchVPBB);
- // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
- // Initially the induction increment is guaranteed to not wrap, but that may
- // change later, e.g. when tail-folding, when the flags need to be dropped.
- auto *CanonicalIVIncrement = Builder.createOverflowingOp(
+  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {true, false}, DL,
"index.next");
- CanonicalIVPHI->addOperand(CanonicalIVIncrement);
-
// Add the BranchOnCount VPInstruction to the latch.
Builder.createNaryOp(VPInstruction::BranchOnCount,
{CanonicalIVIncrement, &Plan.getVectorTripCount()},
@@ -660,7 +662,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
VPIRMetadata VPBranchWeights;
auto *Term = VPBuilder(CheckBlockVPBB)
.createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
- Plan.getCanonicalIV()->getDebugLoc());
+ Plan.getCanonicalIVInfo().DL);
if (AddBranchWeights) {
MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
@@ -789,7 +791,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
VPReductionPHIRecipe *RedPhiR = nullptr;
bool HasUnsupportedPhi = false;
for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R))
continue;
auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
if (!Cur) {
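
During region construction, the temporary scalar IV phi created by addCanonicalIVRecipes is now folded away: its uses are redirected to the region's implicit canonical IV and the phi is erased. A toy replaceAllUsesWith sketch (simplified Use records, not the VPlan use lists):

#include <cassert>
#include <vector>

// Toy replaceAllUsesWith: users of the temporary scalar IV phi are
// redirected to the region's canonical IV so the phi can be erased.
struct Use { const void *Target; };

int main() {
  int ScalarPhi = 0, RegionIV = 0;
  std::vector<Use> Uses = {{&ScalarPhi}, {&ScalarPhi}};
  for (Use &U : Uses)              // ScalarCanIV->replaceAllUsesWith(...)
    if (U.Target == &ScalarPhi)
      U.Target = &RegionIV;
  for (const Use &U : Uses)
    assert(U.Target == &RegionIV); // no users left; safe to erase the phi
  return 0;
}
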
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index fd5c3e4651beb..90199d7a246fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -253,7 +253,6 @@ struct Recipe_match {
auto *DefR = dyn_cast<RecipeTy>(R);
// Check for recipes that do not have opcodes.
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
- std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
return DefR;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd9a93ed57b8a..da2f5dba02448 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -66,7 +66,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
->onlyReadsMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
- case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
@@ -2309,9 +2308,10 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
+ auto *CanIV = getParent()->getParent()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
- getScalarType() == CanIV->getScalarType();
+ getScalarType() ==
+ VPTypeAnalysis(*getParent()->getPlan()).inferScalarType(CanIV);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3877,16 +3877,6 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
0);
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EMIT ";
- printAsOperand(O, SlotTracker);
- O << " = CANONICAL-INDUCTION ";
- printOperands(O, SlotTracker);
-}
-#endif
-
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
return IsScalarAfterVectorization &&
(!IsScalable || vputils::onlyFirstLaneUsed(this));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4e00a39fd0858..abc28f2dffdc9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -367,8 +367,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
PredRecipe->eraseFromParent();
auto *Exiting =
Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- VPRegionBlock *Region =
- Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
+ VPRegionBlock *Region = Plan.createVPRegionBlock(Entry, Exiting, RegionName);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -496,7 +495,7 @@ static void removeRedundantInductionCasts(VPlan &Plan) {
/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPValue *CanonicalIV = Plan.getCanonicalIV();
VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
for (VPUser *U : CanonicalIV->users()) {
WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
@@ -578,7 +577,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
VPValue *StartV, VPValue *Step, DebugLoc DL,
VPBuilder &Builder) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPValue *CanonicalIV = Plan.getCanonicalIV();
VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
@@ -796,7 +795,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// Calculate the final index.
VPValue *EndValue = Plan.getCanonicalIV();
- auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ auto CanonicalIVType = TypeInfo.inferScalarType(EndValue);
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -1465,9 +1464,11 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
});
auto *CanIV = Plan.getCanonicalIV();
- if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
- m_Specific(CanIV->getBackedgeValue()),
- m_Specific(&Plan.getVectorTripCount()))))
+ if (!match(Cond, m_Binary<Instruction::ICmp>(
+ m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
+ m_Specific(&Plan.getVectorTripCount()))) ||
+ cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
+ CmpInst::ICMP_EQ)
return false;
// The compare checks CanIV + VFxUF == vector trip count. The vector trip
@@ -1529,8 +1530,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
return R->isCanonical();
- return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
- VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
+ return isa<VPEVLBasedIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
+ VPPhi>(&Phi);
})) {
for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
@@ -1545,6 +1546,9 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
HeaderR.eraseFromParent();
}
+ Plan.getCanonicalIV()->replaceAllUsesWith(
+ Plan.getOrAddLiveIn(ConstantInt::getNullValue(
+ VPTypeAnalysis(Plan).inferScalarType(Plan.getCanonicalIV()))));
VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
@@ -2022,15 +2026,18 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- auto *CanonicalIVPHI = Plan.getCanonicalIV();
- VPValue *StartV = CanonicalIVPHI->getStartValue();
+ VPValue *CanonicalIV = Plan.getCanonicalIV();
+ VPValue *StartV = Plan.getOrAddLiveIn(Constant::getNullValue(
+ VPTypeAnalysis(Plan).inferScalarType(CanonicalIV)));
auto *CanonicalIVIncrement =
- cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ cast<VPInstruction>(EB->getTerminator()->getOperand(0));
// TODO: Check if dropping the flags is needed if
// !DataAndControlFlowWithoutRuntimeCheck.
CanonicalIVIncrement->dropPoisonGeneratingFlags();
- DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+ auto &CanIVInfo = Plan.getCanonicalIVInfo();
+ CanIVInfo.HasNUW = false;
+ DebugLoc DL = CanIVInfo.DL;
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
// Part * VF
@@ -2051,7 +2058,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// When avoiding a runtime check, the active.lane.mask inside the loop
// uses a modified trip count and the induction variable increment is
// done after the active.lane.mask intrinsic is called.
- IncrementValue = CanonicalIVPHI;
+ IncrementValue = CanonicalIV;
TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
{TC}, DL);
}
@@ -2067,7 +2074,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
- LaneMaskPhi->insertAfter(CanonicalIVPHI);
+ auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
+ LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
// Create the active lane mask for the next iteration of the loop before the
// original terminator.
@@ -2372,7 +2380,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
-/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
+/// VPCanonicalIV with a VPEVLBasedIVPHIRecipe. VPCanonicalIV
/// is used only for loop iterations counting after this transformation.
///
/// The function uses the following definitions:
@@ -2417,13 +2425,18 @@ void VPlanTransforms::addExplicitVectorLength(
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto *CanonicalIVPHI = Plan.getCanonicalIV();
- auto *CanIVTy = CanonicalIVPHI->getScalarType();
- VPValue *StartV = CanonicalIVPHI->getStartValue();
+ auto *CanonicalIV = Plan.getCanonicalIV();
+ auto &CanIVInfo = Plan.getCanonicalIVInfo();
+ auto *CanIVTy = VPTypeAnalysis(Plan).inferScalarType(CanonicalIV);
+ VPValue *StartV = Plan.getOrAddLiveIn(ConstantInt::getNullValue(CanIVTy));
+ auto *CanonicalIVIncrement = cast<VPInstruction>(Plan.getVectorLoopRegion()
+ ->getExitingBasicBlock()
+ ->getTerminator()
+ ->getOperand(0));
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
- EVLPhi->insertAfter(CanonicalIVPHI);
+ EVLPhi->insertBefore(*Header, Header->begin());
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Create the AVL (application vector length), starting from TC -> 0 in steps
// of EVL.
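
A scalar model of the AVL/EVL scheme (a sketch; VFMax stands in for the maximum vector length, and the loop body is elided): AVL counts the remaining elements down from the trip count TC to 0, and each iteration consumes EVL = min(AVL, VFMax) of them.

#include <algorithm>
#include <cstdint>

void evlLoopModel(uint64_t TC, uint64_t VFMax) {
  for (uint64_t AVL = TC; AVL != 0;) {
    // Explicit vector length for this iteration.
    uint64_t EVL = std::min(AVL, VFMax);
    // ... process elements [TC - AVL, TC - AVL + EVL) ...
    AVL -= EVL;
  }
}
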
@@ -2441,8 +2454,6 @@ void VPlanTransforms::addExplicitVectorLength(
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
DebugLoc());
- auto *CanonicalIVIncrement =
- cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Builder.setInsertPoint(CanonicalIVIncrement);
VPValue *OpVPEVL = VPEVL;
@@ -2451,9 +2462,7 @@ void VPlanTransforms::addExplicitVectorLength(
OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
auto *NextEVLIV = Builder.createOverflowingOp(
- Instruction::Add, {OpVPEVL, EVLPhi},
- {CanonicalIVIncrement->hasNoUnsignedWrap(),
- CanonicalIVIncrement->hasNoSignedWrap()},
+ Instruction::Add, {OpVPEVL, EVLPhi}, {CanIVInfo.HasNUW, false},
CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
EVLPhi->addOperand(NextEVLIV);
@@ -2464,10 +2473,10 @@ void VPlanTransforms::addExplicitVectorLength(
transformRecipestoEVLRecipes(Plan, *VPEVL);
- // Replace all uses of VPCanonicalIVPHIRecipe by
+ // Replace all uses of VPCanonicalIV by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
- CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
- CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+ CanonicalIV->replaceAllUsesWith(EVLPhi);
+ CanonicalIVIncrement->setOperand(0, CanonicalIV);
// TODO: support unroll factor > 1.
Plan.setUF(1);
}
@@ -2518,15 +2527,15 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Replace CanonicalIVInc with EVL-PHI increment.
auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
VPValue *Backedge = CanonicalIV->getIncomingValue(1);
- assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
- m_Specific(&Plan.getVFxUF()))) &&
- "Unexpected canonical iv");
- Backedge->replaceAllUsesWith(EVLIncrement);
+ if (match(Backedge,
+ m_c_Add(m_Specific(CanonicalIV), m_Specific(&Plan.getVFxUF())))) {
+ Backedge->replaceAllUsesWith(EVLIncrement);
- // Remove unused phi and increment.
- VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
- CanonicalIVIncrement->eraseFromParent();
- CanonicalIV->eraseFromParent();
+ // Remove unused phi and increment.
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ CanonicalIVIncrement->eraseFromParent();
+ CanonicalIV->eraseFromParent();
+ }
// Replace the use of VectorTripCount in the latch-exiting block.
// Before: (branch-on-count EVLIVInc, VectorTripCount)
@@ -3634,8 +3643,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R) ||
- match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
continue;
if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
@@ -3785,21 +3793,23 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Adjust induction to reflect that the transformed plan only processes one
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
- auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+ Type *CanIVTy = TypeInfo.inferScalarType(CanIV);
+ auto *Inc = cast<VPInstruction>(
+ VectorLoop->getExitingBasicBlock()->getTerminator()->getOperand(0));
VPBuilder PHBuilder(Plan.getVectorPreheader());
VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ ConstantInt::get(CanIVTy, 1 * Plan.getUF()));
if (VF.isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
- CanIV->getScalarType(), ElementCount::getScalable(1));
+ CanIVTy, ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
Inc->setOperand(1, VScaleUF);
Plan.getVF().replaceAllUsesWith(VScale);
} else {
Inc->setOperand(1, UF);
Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 1)));
}
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 7a63d20825a31..3ac45f327507d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -69,7 +69,7 @@ class UnrollState {
VPBasicBlock::iterator InsertPtForPhi);
VPValue *getConstantVPV(unsigned Part) {
- Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+ Type *CanIVIntTy = TypeInfo.inferScalarType(Plan.getCanonicalIV());
return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
}
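
This is the replacement idiom used throughout the patch: with no VPCanonicalIVPHIRecipe to carry the type, the IV's scalar type is inferred on demand. A minimal sketch, assuming a VPlan &Plan with a canonical IV and the usual VPlan.h context:

// The canonical IV no longer stores its scalar type; infer it instead.
Type *CanIVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getCanonicalIV());
VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 1));
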
@@ -79,7 +79,7 @@ class UnrollState {
void unrollBlock(VPBlockBase *VPB);
VPValue *getValueForPart(VPValue *V, unsigned Part) {
- if (Part == 0 || V->isLiveIn())
+ if (Part == 0 || V->isLiveIn() || isa<VPRegionValue>(V))
return V;
assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
"accessed value does not exist");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 700a733bf9f2c..6e83ba03ff35e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -105,9 +105,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return all_of(R->operands(), isUniformAcrossVFsAndUFs);
}
- auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
- // Canonical IV chain is uniform.
- if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+ if (isa<VPRegionValue>(V))
return true;
return TypeSwitch<const VPRecipeBase *, bool>(R)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..7bb739dbc3261 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPUser;
class VPRecipeBase;
class VPInterleaveRecipe;
class VPPhiAccessors;
+class VPRegionValue;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
// flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue {
friend class VPInterleaveRecipe;
friend class VPlan;
friend class VPExpressionRecipe;
+ friend class VPRegionValue;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -89,7 +91,9 @@ class LLVM_ABI_FOR_TEST VPValue {
enum {
VPValueSC, /// A generic VPValue, like live-in values or defined by a recipe
/// that defines multiple values.
- VPVRecipeSC /// A VPValue sub-class that is a VPRecipeBase.
+ VPVRecipeSC, /// A VPValue sub-class that is a VPRecipeBase.
+    VPRegionValueSC, /// A VPValue defined by a VPRegionBlock, such as the
+                     /// canonical IV of a loop region.
};
VPValue(const VPValue &) = delete;
@@ -166,7 +170,9 @@ class LLVM_ABI_FOR_TEST VPValue {
bool hasDefiningRecipe() const { return getDefiningRecipe(); }
/// Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
- bool isLiveIn() const { return !hasDefiningRecipe(); }
+ bool isLiveIn() const {
+ return !hasDefiningRecipe() && SubclassID != VPRegionValueSC;
+ }
/// Returns the underlying IR value, if this VPValue is defined outside the
/// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef
@@ -187,6 +193,18 @@ class LLVM_ABI_FOR_TEST VPValue {
}
};
+/// VPValues defined by a VPRegionBlock, like the canonical IV.
+class VPRegionValue : public VPValue {
+public:
+ VPRegionValue() : VPValue(VPValue::VPRegionValueSC) {}
+
+ ~VPRegionValue() override = default;
+
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPRegionValueSC;
+ }
+};
+
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
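
A small illustration of how the new class interacts with the isLiveIn() change above (a sketch; classify is a hypothetical helper): a VPRegionValue such as the canonical IV has no defining recipe, yet it is not a live-in either, which is exactly why isLiveIn() now also checks the subclass ID.

void classify(const VPValue *V) {
  if (isa<VPRegionValue>(V))
    // Region values are neither recipe-defined nor live-ins.
    assert(!V->hasDefiningRecipe() && !V->isLiveIn() &&
           "region value misclassified");
}
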
@@ -361,7 +379,6 @@ class VPDef {
VPPredInstPHISC,
// START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
// VPHeaderPHIRecipe need to be kept together.
- VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
@@ -371,7 +388,7 @@ class VPDef {
// END: SubclassID for recipes that inherit VPHeaderPHIRecipe
// END: Phi-like recipes
VPFirstPHISC = VPWidenPHISC,
- VPFirstHeaderPHISC = VPCanonicalIVPHISC,
+ VPFirstHeaderPHISC = VPActiveLaneMaskPHISC,
VPLastHeaderPHISC = VPReductionPHISC,
VPLastPHISC = VPReductionPHISC,
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418e..dc0696a5ce844 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -458,12 +458,6 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
return false;
}
- if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
- errs() << "VPlan vector loop header does not start with a "
- "VPCanonicalIVPHIRecipe\n";
- return false;
- }
-
const VPBasicBlock *Exiting = dyn_cast<VPBasicBlock>(TopRegion->getExiting());
if (!Exiting) {
errs() << "VPlan exiting block is not a VPBasicBlock\n";
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index aa3d81a19a6d2..4710fa197b871 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -579,10 +579,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[NEXT_GEP]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br [[EXIT:label %.*]]
@@ -658,15 +658,15 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
-; COMMON: [[PRED_STORE_CONTINUE14]]:
+; COMMON-NEXT: br label %[[EXIT]]
+; COMMON: [[EXIT]]:
; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; COMMON: [[MIDDLE_BLOCK]]:
-; COMMON-NEXT: br [[EXIT:label %.*]]
+; COMMON-NEXT: br [[EXIT1:label %.*]]
; COMMON: [[SCALAR_PH]]:
;
entry:
@@ -1052,11 +1052,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: store float 0.000000e+00, ptr [[TMP83]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE27]]
; PRED: [[PRED_STORE_CONTINUE27]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
; PRED-NEXT: [[TMP84:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP85:%.*]] = xor i1 [[TMP84]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br [[EXIT:label %.*]]
@@ -1347,10 +1347,10 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 16 x float> [[TMP25]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br [[EXIT:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index a44cc09b8a8ea..65b2c348607ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -140,11 +140,11 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP34]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
@@ -258,11 +258,11 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP38]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP39:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP40:%.*]] = xor i1 [[TMP39]], true
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index c3b0bc8c00a74..bc99d50b7b761 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -10,10 +10,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
-; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 0 for VF 8: WIDEN-REDUCTION-PHI ir<{{.+}}> = phi vp<{{.+}}>, ir<{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 0 for VF 16: WIDEN-REDUCTION-PHI ir<{{.+}}> = phi vp<{{.+}}>, ir<{{.+}}>
; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..fa11706a12b0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -124,10 +124,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -291,11 +291,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; PRED: [[PRED_STORE_CONTINUE2]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -480,11 +480,11 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]]
; PRED: [[PRED_STORE_CONTINUE7]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
@@ -671,11 +671,11 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; PRED: [[PRED_STORE_CONTINUE6]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index aed1c3d9fcc4c..53f477e828136 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -85,10 +85,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8
; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]]
; TFCOMMON: pred.store.continue2:
-; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
; TFCOMMON-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; TFCOMMON-NEXT: [[TMP17:%.*]] = xor i1 [[TMP15]], true
+; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFCOMMON-NEXT: br i1 [[TMP17]], label [[END:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; TFCOMMON: end:
; TFCOMMON-NEXT: ret void
@@ -149,12 +149,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]]
; TFA_INTERLEAVE: pred.store.continue9:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]])
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]])
; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor i1 [[TMP26]], true
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TFA_INTERLEAVE-NEXT: br i1 [[TMP28]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFA_INTERLEAVE: end:
; TFA_INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 666057b18ccd0..939c2d5225397 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -992,11 +992,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8
; TFA_INTERLEAVE-NEXT: br label %[[TMP13]]
; TFA_INTERLEAVE: [[TMP13]]:
-; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]]
; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true
+; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TFA_INTERLEAVE: [[END]]:
; TFA_INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index 67e6902b5d32a..2aef013662ad8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -384,9 +384,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
@@ -523,11 +523,11 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; DEFAULT-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; DEFAULT-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; DEFAULT-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
@@ -590,11 +590,11 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; OPTSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; OPTSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true
; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; OPTSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
@@ -657,11 +657,11 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; MINSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; MINSIZE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; MINSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true
; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i8> [[VEC_IND]], [[DOTSPLAT]]
+; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; MINSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 792249d7829b0..22215f4c89ebf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1401,10 +1401,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
@@ -1438,10 +1438,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
@@ -1475,10 +1475,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 8b2da8c4a7047..6161de23ec432 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -233,10 +233,10 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP39:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
; PRED-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[TMP39]], [[WIDE_MASKED_GATHER7]]
; PRED-NEXT: [[TMP41]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI]]
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], [[TMP2]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[IV]], i64 [[TMP10]])
; PRED-NEXT: [[TMP43:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP42:%.*]] = xor i1 [[TMP43]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], [[TMP2]]
; PRED-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP41]])
@@ -456,10 +456,10 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP20:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 8 x i16> [[TMP20]], [[VEC_PHI]]
; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP21]], <vscale x 8 x i16> [[VEC_PHI]]
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; PRED-NEXT: [[TMP15:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP15]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: [[MIDDLE_BLOCK]]:
; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP16]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 375e412861777..6310ab4dca889 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -137,10 +137,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP7]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP8]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = xor i1 [[TMP10]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
@@ -374,7 +374,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP26]], <vscale x 8 x float> [[TMP27]])
; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP30]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP28]], <vscale x 8 x float> [[TMP29]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = shl nuw i64 [[TMP31]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], [[TMP32]]
@@ -390,6 +389,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP39]], i64 [[TMP6]])
; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = xor i1 [[TMP40]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
@@ -626,10 +626,10 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP11]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP15]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
@@ -859,10 +859,10 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP10]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP7]])
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
@@ -1077,10 +1077,10 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> splat (float 3.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
@@ -1526,7 +1526,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], <vscale x 8 x float> [[TMP41]])
; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP36]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP44]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP42]], <vscale x 8 x float> [[TMP43]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = add i64 [[INDEX]], [[TMP46]]
@@ -1542,6 +1541,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]])
; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
@@ -1836,7 +1836,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], <vscale x 8 x float> [[TMP41]])
; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP36]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP44]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP42]], <vscale x 8 x float> [[TMP43]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = add i64 [[INDEX]], [[TMP46]]
@@ -1852,6 +1851,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]])
; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index f6de370874d12..5d0cedf2263c3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -93,10 +93,10 @@ define void @cost_store_i8(ptr %dst) #0 {
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP12:%.*]] = xor i1 [[TMP14]], true
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 32235860dd9e2..b5c8e26092dcc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -109,10 +109,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP18]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
@@ -239,10 +239,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
@@ -373,10 +373,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
@@ -533,10 +533,10 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]], <vscale x 16 x i8> [[TMP20]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index ad9eefd8ee6a1..54c255f961eee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -62,10 +62,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP15]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
index a22f065415307..5d696d1d96d06 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
@@ -84,9 +84,9 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src,
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 42)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index 821b9fbbda78f..bb7a78f087110 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -29,10 +29,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
@@ -75,10 +75,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[VEC_PHI]], [[TMP14]]
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -137,10 +137,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -181,10 +181,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -247,10 +247,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP16]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP20]])
@@ -304,10 +304,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[VEC_PHI]], [[TMP18]]
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
; CHECK-IN-LOOP-NEXT: br label [[FOR_END:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 90224caa68cd4..e1d2a4bbe8cde 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -53,7 +53,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
-; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP64:%.*]] = shl nuw i64 [[TMP63]], 2
; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]]
@@ -69,6 +68,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
+; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -160,7 +160,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
-; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]]
; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP86:%.*]] = shl nuw i64 [[TMP85]], 2
; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]]
@@ -176,6 +175,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP66:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP67:%.*]] = xor i1 [[TMP66]], true
+; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 552d9e23c33c5..e487a91cea5c3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -26,10 +26,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -69,10 +69,10 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = xor i1 [[TMP6]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -116,10 +116,10 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP12]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -175,11 +175,11 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -227,10 +227,10 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -280,10 +280,10 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = xor i1 [[TMP14]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -338,10 +338,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -399,10 +399,10 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -450,10 +450,10 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -505,10 +505,10 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1)
; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP14]], true
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 3b19e9ee1a5a3..0ae1d7ad41aa2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -555,9 +555,9 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
; CHECK-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index 150b79c448005..410d167e75c61 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -27,10 +27,10 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[EXTRACT_FIRST_LANE_MASK:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[FOR_END:label %.*]]
@@ -81,10 +81,10 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK]], <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX6]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX6]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX6]], i64 [[TMP3]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX6]], 4
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[FOR_END:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 24d23d81b1f92..9a2d3a4bdc44b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -184,10 +184,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP15]], true
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index e9de5e21228fd..9c886a4128c62 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -25,9 +25,9 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
@@ -63,13 +63,13 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
; INTERLEAVE-NEXT: ret void
@@ -111,9 +111,9 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
@@ -149,13 +149,13 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
; INTERLEAVE-NEXT: ret void
@@ -227,10 +227,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
; INTERLEAVE-NEXT: store double [[TMP8]], ptr [[TMP9]], align 8
; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
; INTERLEAVE: pred.store.continue4:
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; INTERLEAVE-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]]
; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP10]], [[TMP0]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
; INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index bdf832f32964f..f92cbc029ffe4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -22,7 +22,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 2: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -39,7 +38,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 4: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -56,7 +54,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -135,8 +132,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
; CHECK: Cost of 0 for VF 2: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -167,8 +163,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
; CHECK: Cost of 0 for VF 4: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -199,8 +194,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
; CHECK: Cost of 0 for VF 8: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -231,8 +225,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
; CHECK: Cost of 0 for VF 16: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index dcf4bee728b29..e379420c5cdfa 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -380,9 +380,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
index bb85b88f181f7..d72e33a0355a2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
@@ -10,7 +10,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
-; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4>
@@ -28,7 +27,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
-; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4>
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 615f50124b41d..85f0e326b9663 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -449,8 +449,8 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IC4VF4: [[MIDDLE_BLOCK]]:
; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
@@ -814,8 +814,8 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IC4VF4: [[MIDDLE_BLOCK]]:
; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index 70e730f0284c0..2f580956731ed 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -63,8 +63,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; CHECK: [[PRED_STORE_CONTINUE4]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -139,8 +139,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -209,8 +209,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -285,8 +285,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -358,8 +358,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 87b11dc77d417..c176d69d4ae47 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -21,8 +21,9 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -31,7 +32,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -89,8 +90,9 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%y>, ir<%iv>
@@ -102,7 +104,7 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -163,8 +165,9 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: ir<%i> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%i>, ir<5>
@@ -191,7 +194,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -258,8 +261,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * ir<4>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<4>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.AB.0> = getelementptr inbounds ir<@AB>, ir<0>, vp<[[STEPS]]>
@@ -274,7 +278,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: store ir<1> to index 1
; CHECK-NEXT: store ir<2> to index 2
; CHECK-NEXT: store ir<%AB.3> to index 3
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -344,8 +348,9 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%asd>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd>
@@ -379,7 +384,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT:}
@@ -456,15 +461,16 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, vp<[[EXP_SCEV]]>, vp<[[VF]]> (truncated to i8)
; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * vp<[[EXP_SCEV]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[EXP_SCEV]]>
; CHECK-NEXT: WIDEN ir<%v3> = add nuw ir<%iv>, ir<1>
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
; CHECK-NEXT: REPLICATE store ir<%v3>, ir<%gep>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -523,15 +529,16 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -589,8 +596,9 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.y> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.y>
@@ -601,7 +609,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%div>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -660,8 +668,9 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
@@ -671,7 +680,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%div.1>, ir<%div.2>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.x>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -729,8 +738,9 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%ld.addr>
@@ -761,7 +771,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -829,8 +839,9 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
@@ -840,7 +851,7 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%or.1>, ir<%or.2>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -898,15 +909,16 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l>
; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -943,8 +955,9 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.1> = phi ir<22>, ir<%for.1.next>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
@@ -1016,8 +1029,9 @@ define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, pt
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK: <x1> vector loop: {
+; CHECK-NEXT: vp<[[IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT_EXIT:%.+]]>
; CHECK-NEXT: vp<[[ST:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%b>, vp<[[ST]]>
; CHECK-NEXT: vp<[[PTR1:%.+]]> = vector-pointer ir<[[GEP1]]>
@@ -1031,7 +1045,7 @@ define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, pt
; CHECK-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds nuw ir<%a>, vp<[[ST]]>
; CHECK-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
; CHECK-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[SELECT]]>
-; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
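All of the FileCheck churn above follows one mechanical pattern: the canonical IV is now printed once at the top of the <x1> vector loop region as CANONICAL-IV, the CANONICAL-INDUCTION phi in vector.body is gone, and the [[CAN_IV_NEXT]] pattern variable is consequently bound at the `add nuw` increment rather than at the former phi. For code (as opposed to tests) that used to scan the header block for a VPCanonicalIVPHIRecipe, a minimal sketch of the region-based lookup this patch moves towards; the user loop is illustrative only, not taken from the patch:

  // Fetch the canonical IV from the region itself instead of scanning the
  // header block for a VPCanonicalIVPHIRecipe.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPValue *CanIV = LoopRegion->getCanonicalIV();
  for (VPUser *U : CanIV->users()) {
    // e.g. locate the increment: EMIT vp<%iv.next> = add nuw vp<%iv>, ...
  }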
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index a943e7ac12b1b..91168d6de21fe 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -57,9 +57,7 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) {
EXPECT_EQ(&*Plan, VecBB->getPlan());
auto Iter = VecBB->begin();
- auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(&*Iter++);
- EXPECT_NE(nullptr, CanIV);
- auto *Phi = dyn_cast<VPPhi>(&*Iter++);
+ VPWidenPHIRecipe *Phi = dyn_cast<VPWidenPHIRecipe>(&*Iter++);
EXPECT_NE(nullptr, Phi);
VPInstruction *Idx = dyn_cast<VPInstruction>(&*Iter++);
@@ -218,7 +216,6 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB);
auto Iter = VecBB->begin();
- EXPECT_NE(nullptr, dyn_cast<VPCanonicalIVPHIRecipe>(&*Iter++));
EXPECT_NE(nullptr, dyn_cast<VPWidenPHIRecipe>(&*Iter++));
EXPECT_NE(nullptr, dyn_cast<VPWidenGEPRecipe>(&*Iter++));
EXPECT_NE(nullptr, dyn_cast<VPWidenMemoryRecipe>(&*Iter++));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
index e38b4fad80b0e..7fdeb7328bee7 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
@@ -23,32 +23,31 @@ using VPPatternMatchTest = VPlanTestBase;
TEST_F(VPPatternMatchTest, ScalarIVSteps) {
VPlan &Plan = getPlan();
+ IntegerType *I64Ty = IntegerType::get(C, 64);
+ VPRegionBlock *VPR =
+ Plan.createVPRegionBlock(DebugLoc::getCompilerGenerated(), "");
+ VPValue *CanIV = VPR->getCanonicalIV();
VPBasicBlock *VPBB = Plan.createVPBasicBlock("");
VPBuilder Builder(VPBB);
- IntegerType *I64Ty = IntegerType::get(C, 64);
- VPValue *StartV = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
- auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DebugLoc());
- Builder.insert(CanonicalIVPHI);
-
VPValue *Inc = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
VPValue *VF = &Plan.getVF();
- VPValue *Steps = Builder.createScalarIVSteps(
- Instruction::Add, nullptr, CanonicalIVPHI, Inc, VF, DebugLoc());
+ VPValue *Steps = Builder.createScalarIVSteps(Instruction::Add, nullptr, CanIV,
+ Inc, VF, DebugLoc());
VPValue *Inc2 = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 2));
- VPValue *Steps2 = Builder.createScalarIVSteps(
- Instruction::Add, nullptr, CanonicalIVPHI, Inc2, VF, DebugLoc());
+ VPValue *Steps2 = Builder.createScalarIVSteps(Instruction::Add, nullptr,
+ CanIV, Inc2, VF, DebugLoc());
using namespace VPlanPatternMatch;
- ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
- m_SpecificInt(1), m_Specific(VF))));
+ ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(1),
+ m_Specific(VF))));
ASSERT_FALSE(
- match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
- m_SpecificInt(1), m_Specific(VF))));
- ASSERT_TRUE(match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
- m_SpecificInt(2), m_Specific(VF))));
+ match(Steps2, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(1),
+ m_Specific(VF))));
+ ASSERT_TRUE(match(Steps2, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(2),
+ m_Specific(VF))));
}
} // namespace
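The updated pattern-match test shows the new unit-test setup end to end: instead of allocating a VPCanonicalIVPHIRecipe, inserting it via the builder, and passing the recipe around, a test creates a region and uses the VPValue the region itself owns. A condensed sketch of that idiom, mirroring the test above:

  // The canonical IV is a VPValue owned by the region, so it can be handed
  // straight to builders and matchers; no recipe to allocate or insert.
  VPRegionBlock *VPR =
      Plan.createVPRegionBlock(DebugLoc::getCompilerGenerated(), "");
  VPValue *CanIV = VPR->getCanonicalIV();
  VPBuilder Builder(Plan.createVPBasicBlock(""));
  VPValue *Steps = Builder.createScalarIVSteps(
      Instruction::Add, /*FPBinOp=*/nullptr, CanIV,
      Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)), &Plan.getVF(),
      DebugLoc());
  // Matching is unchanged apart from the value being matched:
  ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(1),
                                           m_Specific(&Plan.getVF()))));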
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index c2f045bf524e9..b4de7b8780e3b 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -24,14 +24,12 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBB1->appendRecipe(UseI);
VPBB1->appendRecipe(DefI);
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
- VPBB2->appendRecipe(CanIV);
VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
VPBlockUtils::connectBlocks(VPBB1, R1);
VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
@@ -59,15 +57,13 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
- new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+ new VPInstruction(VPInstruction::BranchOnCond, {UseI});
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
VPBB1->appendRecipe(UseI);
- VPBB2->appendRecipe(CanIV);
VPBB2->appendRecipe(DefI);
VPBB2->appendRecipe(BranchOnCond);
@@ -100,9 +96,8 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 0));
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
- new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+ new VPInstruction(VPInstruction::BranchOnCond, {DefI});
auto *Blend = new VPBlendRecipe(Phi, {DefI}, {});
VPBasicBlock *VPBB1 = Plan.getEntry();
@@ -110,7 +105,6 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
- VPBB2->appendRecipe(CanIV);
VPBB3->appendRecipe(Blend);
VPBB4->appendRecipe(DefI);
VPBB4->appendRecipe(BranchOnCond);
@@ -157,8 +151,6 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) {
VPPhi *Phi = new VPPhi({DefI}, {});
VPBB2->appendRecipe(Phi);
VPBB2->appendRecipe(DefI);
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
- VPBB3->appendRecipe(CanIV);
VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB3, VPBB3, "R1");
VPBlockUtils::connectBlocks(VPBB1, VPBB2);
@@ -186,9 +178,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
VPlan &Plan = getPlan();
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
- new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+ new VPInstruction(VPInstruction::BranchOnCond, {I1});
VPInstruction *BranchOnCond2 =
new VPInstruction(VPInstruction::BranchOnCond, {I1});
@@ -197,7 +188,6 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
VPBB1->appendRecipe(I1);
VPBB1->appendRecipe(BranchOnCond2);
- VPBB2->appendRecipe(CanIV);
VPBB2->appendRecipe(BranchOnCond);
VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
@@ -220,9 +210,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
VPlan &Plan = getPlan();
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
- new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+ new VPInstruction(VPInstruction::BranchOnCond, {I1});
VPInstruction *BranchOnCond2 =
new VPInstruction(VPInstruction::BranchOnCond, {I1});
@@ -231,7 +220,6 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
VPBB1->appendRecipe(I1);
- VPBB2->appendRecipe(CanIV);
VPBB2->appendRecipe(BranchOnCond2);
VPBB3->appendRecipe(BranchOnCond);
@@ -260,8 +248,6 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
- VPBB2->appendRecipe(CanIV);
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *BranchOnCond =
@@ -289,14 +275,11 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
TEST_F(VPVerifierTest, NonHeaderPHIInHeader) {
VPlan &Plan = getPlan();
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
- auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
- auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+ auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {Zero});
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("header");
- VPBB2->appendRecipe(CanIV);
-
PHINode *PHINode = PHINode::Create(Type::getInt32Ty(C), 2);
auto *IRPhi = new VPIRPhi(*PHINode);
VPBB2->appendRecipe(IRPhi);
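Across the verifier tests the change is uniform: the hand-seeded VPCanonicalIVPHIRecipe existed only to populate the region's header and to give BranchOnCond an operand, and with the IV folded into the region it can be dropped, with BranchOnCond taking whichever in-plan value is at hand. A minimal sketch of the resulting scaffolding, assembled from the updated tests above:

  // No canonical-IV phi needs to be seeded; any in-plan value can serve as
  // the branch condition when wiring up a region for the verifier.
  VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
  auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {DefI});
  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
  VPBB2->appendRecipe(DefI);
  VPBB2->appendRecipe(BranchOnCond);
  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");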