[llvm] [VPlan] Run narrowInterleaveGroups during general VPlan optimizations. (PR #149706)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 01:54:33 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/149706
>From dc6be02e368d1177b0b9e8d275054ac10bb10eba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 19 Jul 2025 13:11:10 +0100
Subject: [PATCH 1/2] [VPlan] Run narrowInterleaveGroups during general VPlan
optimizations.
Move narrowInterleaveGroups to the general VPlan optimization stage.
To do so, narrowInterleaveGroups now has to find a suitable VF where all
interleave groups are consecutive and saturate the full vector width.
If such a VF is found, the original VPlan is split into 2:
a) a new clone which contains all VFs of Plan, except VFToOptimize, and
b) the original Plan with VFToOptimize as its single VF.
The original Plan is then optimized. If a new copy for the other VFs has
been created, it is returned and the caller has to add it to the list of
candidate plans.
Together with https://github.com/llvm/llvm-project/pull/149702, this
allows the narrowed interleave groups to be taken into account when
interleaving.
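
For illustration, the splitting step can be sketched with a small,
self-contained C++ model; ToyPlan and splitCandidatePlan below are
stand-ins invented for this sketch, not the VPlan/VPlanTransforms APIs
touched by this patch.

  #include <memory>
  #include <vector>

  // Toy model: a plan is represented only by the known-min values of its VFs.
  struct ToyPlan {
    std::vector<unsigned> VFs;
  };

  // If the plan has more than one VF, shrink it to VFToOptimize and return a
  // clone carrying all remaining VFs; the caller adds the clone as an extra
  // candidate plan. Returns nullptr when there is nothing to split off.
  std::unique_ptr<ToyPlan> splitCandidatePlan(ToyPlan &Plan,
                                              unsigned VFToOptimize) {
    if (Plan.VFs.size() == 1)
      return nullptr;
    auto Rest = std::make_unique<ToyPlan>();
    for (unsigned VF : Plan.VFs)
      if (VF != VFToOptimize)
        Rest->VFs.push_back(VF);
    Plan.VFs = {VFToOptimize};
    return Rest;
  }

The actual patch follows the same shape in narrowInterleaveGroups: the
original Plan keeps VFToOptimize, and the duplicate returned to
buildVPlansWithVPRecipes retains the remaining VFs.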
---
.../Transforms/Vectorize/LoopVectorize.cpp | 11 +-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlan.h | 6 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 91 ++++++++----
.../Transforms/Vectorize/VPlanTransforms.h | 21 ++-
...interleave-to-widen-memory-constant-ops.ll | 44 ++----
...-narrow-interleave-to-widen-memory-cost.ll | 56 +++----
...eave-to-widen-memory-remove-loop-region.ll | 8 +-
...arrow-interleave-to-widen-memory-unroll.ll | 32 ++--
...sform-narrow-interleave-to-widen-memory.ll | 6 +-
...sform-narrow-interleave-to-widen-memory.ll | 138 ++++++++++++------
11 files changed, 245 insertions(+), 169 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1bfd70ee53c7c..589f7aac29fb5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7223,9 +7223,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8336,6 +8333,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL() && !HasScalarVF)
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
+
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(
+ *Plan,
+ TTI.getRegisterBitWidth(
+ TargetTransformInfo::RGK_FixedWidthVector),
+ SubRange))
+ VPlans.push_back(std::move(P));
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1438dc366b55d..817b5c053bc0f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1201,6 +1201,7 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
+ Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 055db2b9adb95..b4e117c83da11 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3970,6 +3970,9 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;
+ /// Represents the symbolic unroll factor of the loop.
+ VPValue UF;
+
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;
@@ -4121,6 +4124,9 @@ class VPlan {
/// Returns the VF of the vector loop region.
VPValue &getVF() { return VF; };
+ /// Returns the symbolic UF of the vector loop region.
+ VPValue &getSymbolicUF() { return UF; };
+
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d376557553c0..1ac2cc8ce9205 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3484,6 +3484,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
// used.
// TODO: Assert that they aren't used.
+ VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
// If there are no users of the runtime VF, compute VFxUF by constant folding
// the multiplication of VF and UF.
if (VF.getNumUsers() == 0) {
@@ -3503,7 +3506,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -3612,16 +3614,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
+ VFRange &Range) {
+ using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
if (!VectorLoop)
- return;
+ return nullptr;
VPTypeAnalysis TypeInfo(Plan);
-
- unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
+ std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) ||
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
@@ -3636,30 +3640,38 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return;
+ return nullptr;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return;
-
- // Do not narrow interleave groups if there are VectorPointer recipes and
- // the plan was unrolled. The recipe implicitly uses VF from
- // VPTransformState.
- // TODO: Remove restriction once the VF for the VectorPointer offset is
- // modeled explicitly as operand.
- if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
- return;
+ return nullptr;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
- VectorRegWidth))
- return;
-
+ // Try to find a single VF where all interleave groups are consecutive and
+ // saturate the full vector width. If we already have a candidate VF, check
+ // if it is applicable for the current InterleaveR; otherwise look for a
+ // suitable VF across the Plan's VFs.
+ //
+ if (VFToOptimize) {
+ if (!isConsecutiveInterleaveGroup(InterleaveR,
+ VFToOptimize->getKnownMinValue(),
+ TypeInfo, VectorRegWidth))
+ return nullptr;
+ } else {
+ for (ElementCount VF : Plan.vectorFactors()) {
+ if (isConsecutiveInterleaveGroup(InterleaveR, VF.getKnownMinValue(),
+ TypeInfo, VectorRegWidth)) {
+ VFToOptimize = VF;
+ break;
+ }
+ }
+ if (!VFToOptimize)
+ return nullptr;
+ }
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -3693,24 +3705,44 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
- return;
+ return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return;
+ return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return;
+ return nullptr;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return;
+ return nullptr;
+
+ // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+ // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+ // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
+ std::unique_ptr<VPlan> NewPlan;
+ if (size(Plan.vectorFactors()) != 1) {
+ NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+ Plan.setVF(*VFToOptimize);
+ bool First = true;
+ for (ElementCount VF : NewPlan->vectorFactors()) {
+ if (VF == VFToOptimize)
+ continue;
+ if (First) {
+ NewPlan->setVF(VF);
+ First = false;
+ continue;
+ }
+ NewPlan->addVF(VF);
+ }
+ }
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
auto NarrowOp = [](VPValue *V) -> VPValue * {
@@ -3777,9 +3809,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
- VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
- if (VF.isScalable()) {
+ VPValue *UF = &Plan.getSymbolicUF();
+ if (VFToOptimize->isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
CanIV->getScalarType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -3791,6 +3822,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
}
removeDeadRecipes(Plan);
+ assert(none_of(*VectorLoop->getEntryBasicBlock(),
+ IsaPred<VPVectorPointerRecipe>) &&
+ "All VPVectorPointerRecipes should have been removed");
+ return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 9cf62a35ae36b..2e38aa99a56be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -324,14 +324,19 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);
- /// Try to convert a plan with interleave groups with VF elements to a plan
- /// with the interleave groups replaced by wide loads and stores processing VF
- /// elements, if all transformed interleave groups access the full vector
- /// width (checked via \o VectorRegWidth). This effectively is a very simple
- /// form of loop-aware SLP, where we use interleave groups to identify
- /// candidates.
- static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth);
+ /// Try to find a single VF among \p Plan's VFs for which all interleave
+ /// groups (with VF elements) can be replaced by wide loads and stores
+ /// processing VF elements, if all transformed interleave groups access the
+ /// full vector width (checked via \p VectorRegWidth). If the transformation
+ /// can be applied, the original \p Plan will be split in 2 if it has
+ /// multiple VFs: a) a new clone which contains all VFs of Plan, except
+ /// VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
+ /// In that case, the new clone is returned.
+ ///
+ /// This effectively is a very simple form of loop-aware SLP, where we use
+ /// interleave groups to identify candidates.
+ static std::unique_ptr<VPlan>
+ narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth, VFRange &Range);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 3215356bf6ce4..d6c4bba30ee67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -175,28 +175,18 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -237,28 +227,18 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index ac39ecff7f8e8..d19df6d58a811 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -326,46 +326,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8
-; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8
-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8
-; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]]
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index a4173cb2781c1..d3f051af06f50 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -13,12 +13,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index ee5f1929f41e4..0955adfbde389 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -75,32 +75,26 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 41b89dc0a4df6..bc7d862c01c95 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -328,10 +328,8 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index 2a3ce037e9567..49a68e1160ca0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -8,15 +8,70 @@ target triple = "x86_64-unknown-linux"
define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
; CHECK-LABEL: define void @test_4xi64(
; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16
+; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]]
+; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8
+; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
@@ -27,15 +82,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
@@ -56,7 +111,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -117,7 +172,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -140,7 +195,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -195,7 +250,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -218,7 +273,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -273,7 +328,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -296,7 +351,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -351,7 +406,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -374,7 +429,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -435,7 +490,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -459,7 +514,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -519,7 +574,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -544,7 +599,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -653,7 +708,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -677,7 +732,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -711,19 +766,20 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
;.
>From 4f7770e6bf942ad268aad46d76d47866826a59ad Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 2 Sep 2025 21:28:40 +0100
Subject: [PATCH 2/2] !fixup move getting vector bitwidth to transform.
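
The fixup replaces the VectorRegWidth parameter with a per-VF query of TTI
inside the transform, choosing between the fixed-width and scalable vector
register classes depending on the VF. A minimal stand-alone sketch of that
selection; ToyTTI, RegKind, ToyVF and the placeholder bit widths are
assumptions for illustration, not the real TargetTransformInfo interface.

  #include <cstdint>

  enum class RegKind { FixedWidthVector, ScalableVector };

  struct ToyTTI {
    // Stand-in for TTI.getRegisterBitWidth(); returns a known-min bit width.
    uint64_t getRegisterBitWidth(RegKind K) const {
      return K == RegKind::FixedWidthVector ? 256 : 128; // placeholder widths
    }
  };

  struct ToyVF {
    unsigned KnownMinValue;
    bool Scalable;
  };

  // Mirrors the GetVectorWidthForVF lambda added below: the register class to
  // query depends on whether the candidate VF is fixed or scalable.
  uint64_t vectorWidthForVF(const ToyTTI &TTI, ToyVF VF) {
    return TTI.getRegisterBitWidth(VF.Scalable ? RegKind::ScalableVector
                                               : RegKind::FixedWidthVector);
  }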
---
.../Transforms/Vectorize/LoopVectorize.cpp | 9 +--------
.../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++++++------
.../Transforms/Vectorize/VPlanTransforms.h | 2 +-
3 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f94764ceb77c4..86b8da2aa5c21 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7234,9 +7234,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -8315,11 +8312,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
- if (auto P = VPlanTransforms::narrowInterleaveGroups(
- *Plan,
- TTI.getRegisterBitWidth(
- TargetTransformInfo::RGK_FixedWidthVector),
- SubRange))
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
VPlans.push_back(std::move(P));
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c1060abef0b49..f4d6c8442d8e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3878,14 +3878,22 @@ static bool isAlreadyNarrow(VPValue *VPV) {
}
std::unique_ptr<VPlan>
-VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
- VFRange &Range) {
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+ const TargetTransformInfo &TTI) {
using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
if (!VectorLoop)
return nullptr;
+ auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+ return TTI
+ .getRegisterBitWidth(VF.isFixed()
+ ? TargetTransformInfo::RGK_FixedWidthVector
+ : TargetTransformInfo::RGK_ScalableVector)
+ .getKnownMinValue();
+ };
+
VPTypeAnalysis TypeInfo(Plan);
SmallVector<VPInterleaveRecipe *> StoreGroups;
std::optional<ElementCount> VFToOptimize;
@@ -3920,14 +3928,14 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
// suitable VF across the Plan's VFs.
//
if (VFToOptimize) {
- if (!isConsecutiveInterleaveGroup(InterleaveR,
- VFToOptimize->getKnownMinValue(),
- TypeInfo, VectorRegWidth))
+ if (!isConsecutiveInterleaveGroup(
+ InterleaveR, VFToOptimize->getKnownMinValue(), TypeInfo,
+ GetVectorWidthForVF(*VFToOptimize)))
return nullptr;
} else {
for (ElementCount VF : Plan.vectorFactors()) {
if (isConsecutiveInterleaveGroup(InterleaveR, VF.getKnownMinValue(),
- TypeInfo, VectorRegWidth)) {
+ TypeInfo, GetVectorWidthForVF(VF))) {
VFToOptimize = VF;
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 22e031c8cc06f..6483210529a44 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -345,7 +345,7 @@ struct VPlanTransforms {
/// This effectively is a very simple form of loop-aware SLP, where we use
/// interleave groups to identify candidates.
static std::unique_ptr<VPlan>
- narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth, VFRange &Range);
+ narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop