[llvm] 6f253e8 - Reapply "[VPlan] Run narrowInterleaveGroups during general VPlan optimizations. (#149706)"
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 15 12:10:39 PST 2026
Author: Florian Hahn
Date: 2026-02-15T20:10:10Z
New Revision: 6f253e87ddac3c3fffd71c6c5ccfa457096bc191
URL: https://github.com/llvm/llvm-project/commit/6f253e87ddac3c3fffd71c6c5ccfa457096bc191
DIFF: https://github.com/llvm/llvm-project/commit/6f253e87ddac3c3fffd71c6c5ccfa457096bc191.diff
LOG: Reapply "[VPlan] Run narrowInterleaveGroups during general VPlan optimizations. (#149706)"
This reverts commit 8d29d09309654541fb2861524276ada6a3ebf84c.
The underlying issue causing the revert has been fixed independently
as 301fa24671256734df6b7ee65f23ad885400108e.
Original message:
Move narrowInterleaveGroups to to general VPlan optimization stage.
To do so, narrowInterleaveGroups now has to find a suitable VF where all
interleave groups are consecutive and saturate the full vector width.
If such a VF is found, the original VPlan is split into 2:
a) a new clone which contains all VFs of Plan, except VFToOptimize, and
b) the original Plan with VFToOptimize as single VF.
The original Plan is then optimized. If a new copy for the other VFs has
been created, it is returned and the caller has to add it to the list of
candidate plans.
Together with https://github.com/llvm/llvm-project/pull/149702, this
allows to take the narrowed interleave groups into account when
computing costs to choose the best VF and interleave count.
One example where we currently miss interleaving/unrolling when
narrowing interleave groups is https://godbolt.org/z/Yz77zbacz
PR: https://github.com/llvm/llvm-project/pull/149706
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/CodeGen/WebAssembly/memory-interleave.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a1971683cfdf6..72400e1055427 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7445,11 +7445,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
return DenseMap<const SCEV *, Value *>();
}
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(BestVF.isScalable()
- ? TargetTransformInfo::RGK_ScalableVector
- : TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8149,6 +8144,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
CM.getMaxSafeElements());
RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
}
+
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
+ VPlans.push_back(std::move(P));
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 7f958461f0ec9..bb36659cdba6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4679,6 +4679,12 @@ class VPlan {
VFs.insert(VF);
}
+ /// Remove \p VF from the plan.
+ void removeVF(ElementCount VF) {
+ assert(hasVF(VF) && "tried to remove VF not present in plan");
+ VFs.remove(VF);
+ }
+
bool hasVF(ElementCount VF) const { return VFs.count(VF); }
bool hasScalableVF() const {
return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 174e428d05c62..2f9f5df5a38d4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5157,15 +5157,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
- ElementCount VF,
- VPTypeAnalysis &TypeInfo,
- TypeSize VectorRegWidth) {
+/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
+/// number of members both equal to VF. The interleave group must also access
+/// the full vector width.
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+ VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+ VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
if (!InterleaveR || InterleaveR->getMask())
- return false;
+ return std::nullopt;
Type *GroupElementTy = nullptr;
if (InterleaveR->getStoredValues().empty()) {
@@ -5174,7 +5173,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
} else {
GroupElementTy =
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -5182,15 +5181,29 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
}
- unsigned VFMin = VF.getKnownMinValue();
- TypeSize GroupSize = TypeSize::get(
- GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
- const auto *IG = InterleaveR->getInterleaveGroup();
- return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
- GroupSize == VectorRegWidth;
+ auto IG = InterleaveR->getInterleaveGroup();
+ if (IG->getFactor() != IG->getNumMembers())
+ return std::nullopt;
+
+ auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
+ TypeSize Size = TTI.getRegisterBitWidth(
+ VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+ : TargetTransformInfo::RGK_ScalableVector);
+ assert(Size.isScalable() == VF.isScalable() &&
+ "if Size is scalable, VF must be scalable and vice versa");
+ return Size.getKnownMinValue();
+ };
+
+ for (ElementCount VF : VFs) {
+ unsigned MinVal = VF.getKnownMinValue();
+ unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+ if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
+ return {VF};
+ }
+ return std::nullopt;
}
/// Returns true if \p VPValue is a narrow VPValue.
@@ -5254,16 +5267,22 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
return N;
}
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- TypeSize VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+ const TargetTransformInfo &TTI) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
- if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
- return;
+
+ if (!VectorLoop)
+ return nullptr;
+
+ // Only handle single-block loops for now.
+ if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
+ return nullptr;
// Skip plans when we may not be able to properly narrow.
VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
if (!match(&Exiting->back(), m_BranchOnCount()))
- return;
+ return nullptr;
assert(match(&Exiting->back(),
m_BranchOnCount(m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())),
@@ -5271,8 +5290,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
"unexpected branch-on-count");
VPTypeAnalysis TypeInfo(Plan);
-
SmallVector<VPInterleaveRecipe *> StoreGroups;
+ std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
@@ -5286,29 +5305,29 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return;
+ return nullptr;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return;
-
- // Do not narrow interleave groups if there are VectorPointer recipes and
- // the plan was unrolled. The recipe implicitly uses VF from
- // VPTransformState.
- // TODO: Remove restriction once the VF for the VectorPointer offset is
- // modeled explicitly as operand.
- if (isa<VPVectorPointerRecipe>(&R) && Plan.getConcreteUF() > 1)
- return;
+ return nullptr;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
- VectorRegWidth))
- return;
+ // Try to find a single VF, where all interleave groups are consecutive and
+ // saturate the full vector width. If we already have a candidate VF, check
+ // if it is applicable for the current InterleaveR, otherwise look for a
+ // suitable VF across the Plan's VFs.
+ SmallVector<ElementCount> VFs =
+ VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
+ : to_vector(Plan.vectorFactors());
+ std::optional<ElementCount> NarrowedVF =
+ isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
+ if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
+ return nullptr;
+ VFToOptimize = NarrowedVF;
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
@@ -5342,24 +5361,35 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 =
dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
if (!WideMember0)
- return;
+ return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V);
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return;
+ return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return;
+ return nullptr;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return;
+ return nullptr;
+
+ // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+ // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+ // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
+ // TODO: Handle cases where only some interleave groups can be narrowed.
+ std::unique_ptr<VPlan> NewPlan;
+ if (size(Plan.vectorFactors()) != 1) {
+ NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+ Plan.setVF(*VFToOptimize);
+ NewPlan->removeVF(*VFToOptimize);
+ }
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -5382,9 +5412,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
- VPValue *UF = Plan.getConstantInt(VectorLoop->getCanonicalIVType(),
- Plan.getConcreteUF());
- if (VF.isScalable()) {
+ VPValue *UF = &Plan.getUF();
+ if (VFToOptimize->isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createOverflowingOp(
@@ -5397,6 +5426,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getConstantInt(CanIV->getScalarType(), 1));
}
removeDeadRecipes(Plan);
+ assert(none_of(*VectorLoop->getEntryBasicBlock(),
+ IsaPred<VPVectorPointerRecipe>) &&
+ "All VPVectorPointerRecipes should have been removed");
+ return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 35597c95f26f2..b76fde2bd1217 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -425,14 +425,20 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);
- /// Try to convert a plan with interleave groups with VF elements to a plan
- /// with the interleave groups replaced by wide loads and stores processing VF
- /// elements, if all transformed interleave groups access the full vector
- /// width (checked via \o VectorRegWidth). This effectively is a very simple
- /// form of loop-aware SLP, where we use interleave groups to identify
- /// candidates.
- static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- TypeSize VectorRegWidth);
+ /// Try to find a single VF among \p Plan's VFs for which all interleave
+ /// groups (with known minimum VF elements) can be replaced by wide loads and
+ /// stores processing VF elements, if all transformed interleave groups access
+ /// the full vector width (checked via the maximum vector register width). If
+ /// the transformation can be applied, the original \p Plan will be split in
+ /// 2:
+ /// 1. The original Plan with the single VF containing the optimized recipes
+ /// using wide loads instead of interleave groups.
+ /// 2. A new clone which contains all VFs of Plan except the optimized VF.
+ ///
+ /// This effectively is a very simple form of loop-aware SLP, where we use
+ /// interleave groups to identify candidates.
+ static std::unique_ptr<VPlan>
+ narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 999099d953f9d..e4d36ce63c26e 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -2003,7 +2003,9 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: four_floats_same_op:
; CHECK: loop
-; CHECK-NOT: v128.load
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: v128.store
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 005ca8c9b2d93..52bd8a0a11e35 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -28,8 +28,9 @@ define void @test_add_double_same_const_args_1(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -76,10 +77,11 @@ define void @test_add_double_same_const_args_2(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -136,10 +138,11 @@ define void @test_add_double_mixed_const_args(ptr %res, ptr noalias %A, ptr noal
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -175,33 +178,24 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -237,33 +231,24 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -322,10 +307,11 @@ define void @test_add_double_same_var_args_at_
diff erent_positions(ptr %res, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -386,10 +372,11 @@ define void @test_add_double_
diff erent_var_args_1(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
@@ -450,10 +437,11 @@ define void @test_add_double_
diff erent_var_args_2(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 6eb8242bf7975..5e37f9eff4ba2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i64 2
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i64 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i64 6
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP39:%.*]] = load double, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP39]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP27]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP38]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8
-; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8
-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8
-; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]]
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP37]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -435,7 +435,7 @@ exit:
ret void
}
-; FIXME: We should interleave by 2 after narrowing interleave groups to saturate
+; We interleave by 2 after narrowing interleave groups to saturate
; load/store units.
define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK-LABEL: define void @test_interleave_after_narrowing(
@@ -447,12 +447,18 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index c26176028626b..2865495954140 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -13,12 +13,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index b14b1783c97e3..22291203b8c71 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -60,32 +60,26 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i64 2
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 75980ba1189cd..5c9a3eac44fa0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -22,8 +22,9 @@ define void @load_store_interleave_group(ptr noalias %data) {
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @load_store_interleave_group(
; VF4-SAME: ptr noalias [[DATA:%.*]]) {
@@ -45,8 +46,9 @@ define void @load_store_interleave_group(ptr noalias %data) {
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -87,8 +89,9 @@ define void @load_store_interleave_group_
diff erent_objecs(ptr noalias %src, ptr
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @load_store_interleave_group_
diff erent_objecs(
; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
@@ -111,8 +114,9 @@ define void @load_store_interleave_group_
diff erent_objecs(ptr noalias %src, ptr
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -157,8 +161,9 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @single_wide_load_store_interleave_group(
; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
@@ -179,8 +184,9 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -219,8 +225,9 @@ define void @same_constant_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @same_constant_store_interleave_group(
; VF4-SAME: i64 [[X:%.*]], ptr noalias [[DST:%.*]]) {
@@ -237,8 +244,9 @@ define void @same_constant_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF4-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -275,8 +283,9 @@ define void @
diff erent_constants_store_interleave_group(i64 %x, i64 %y, ptr noal
; VF2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @
diff erent_constants_store_interleave_group(
; VF4-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr noalias [[DST:%.*]]) {
@@ -293,8 +302,9 @@ define void @
diff erent_constants_store_interleave_group(i64 %x, i64 %y, ptr noal
; VF4-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -328,15 +338,14 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @same_live_in_store_interleave_group(
; VF4-SAME: i64 [[X:%.*]], ptr noalias [[DST:%.*]]) {
@@ -357,8 +366,9 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF4-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -401,8 +411,9 @@ define void @
diff erent_live_ins_store_interleave_group(i64 %x, i64 %y, ptr noali
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @
diff erent_live_ins_store_interleave_group(
; VF4-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr noalias [[DST:%.*]]) {
@@ -425,8 +436,9 @@ define void @
diff erent_live_ins_store_interleave_group(i64 %x, i64 %y, ptr noali
; VF4-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -466,8 +478,9 @@ define void @single_uniform_load_store_interleave_group(ptr noalias %src, ptr no
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @single_uniform_load_store_interleave_group(
; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
@@ -489,8 +502,9 @@ define void @single_uniform_load_store_interleave_group(ptr noalias %src, ptr no
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -536,8 +550,9 @@ define void @multiple_uniform_load_store_interleave_group(ptr noalias %src.0, pt
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @multiple_uniform_load_store_interleave_group(
; VF4-SAME: ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], ptr noalias [[DST:%.*]]) {
@@ -562,8 +577,9 @@ define void @multiple_uniform_load_store_interleave_group(ptr noalias %src.0, pt
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
@@ -606,8 +622,9 @@ define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr n
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br [[EXIT:label %.*]]
-; VF2: [[SCALAR_PH:.*:]]
+; VF2-NEXT: br label %[[EXIT:.*]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
;
; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -631,8 +648,9 @@ define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr n
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br [[EXIT:label %.*]]
-; VF4: [[SCALAR_PH:.*:]]
+; VF4-NEXT: br label %[[EXIT:.*]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index e60268fe6a087..54cbab78b1e29 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -624,7 +624,7 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
; CHECK-LABEL: four_bytes_same_op
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
-; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 20.
; CHECK: LV: Vector loop of width 2 costs: 40.
@@ -685,7 +685,7 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
; CHECK-LABEL: four_bytes_split_op
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
-; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 20.
; CHECK: LV: Vector loop of width 2 costs: 45.
@@ -750,7 +750,7 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
; CHECK-LABEL: four_bytes_interleave_op
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
-; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 20.
; CHECK: LV: Vector loop of width 2 costs: 40
@@ -1771,7 +1771,7 @@ for.body: ; preds = %entry, %for.body
; CHECK: LV: Scalar loop costs: 24
; CHECK: LV: Vector loop of width 2 costs: 33
; CHECK: LV: Vector loop of width 4 costs: 30
-; CHECK: LV: Selecting VF: 1
+; CHECK: LV: Selecting VF: 4
define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index 2a3ce037e9567..cfa601469464f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -12,11 +12,65 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
+; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16
+; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY1:.*]]
+; CHECK: [[VECTOR_BODY1]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY1]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]]
+; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8
+; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK1]]:
+; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
@@ -27,12 +81,12 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -56,7 +110,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -117,7 +171,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -140,7 +194,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -195,7 +249,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -218,7 +272,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -273,7 +327,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -296,7 +350,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -351,7 +405,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -374,7 +428,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -435,7 +489,7 @@ define void @test_2xi64_
diff erent_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -459,7 +513,7 @@ define void @test_2xi64_
diff erent_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -519,7 +573,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -544,7 +598,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -653,7 +707,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -677,7 +731,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -711,7 +765,7 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
@@ -726,4 +780,6 @@ exit:
; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
;.
More information about the llvm-commits
mailing list