[llvm] 577631f - Reapply "[VPlan] Add transformation to narrow interleave groups. (#106441)"
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 13:57:16 PDT 2025
Author: Florian Hahn
Date: 2025-03-25T20:57:10Z
New Revision: 577631f0a52884384786bf899951e3094ad7f69c
URL: https://github.com/llvm/llvm-project/commit/577631f0a52884384786bf899951e3094ad7f69c
DIFF: https://github.com/llvm/llvm-project/commit/577631f0a52884384786bf899951e3094ad7f69c.diff
LOG: Reapply "[VPlan] Add transformation to narrow interleave groups. (#106441)"
This reverts commit ff3e2ba9eb94217f3ad3525dc18b0c7b684e0abf.
The recommitted version limits the transform to cases where no
interleaving is taking place, to avoid a miscompile when interleaving.
Original commit message:
This patch adds a new narrowInterleaveGroups transform, which tries to
convert a plan with interleave groups with VF elements to a plan that
instead replaces the interleave groups with wide loads and stores
processing VF elements.
This effectively is a very simple form of loop-aware SLP, where we
use interleave groups to identify candidates.
This initial version is quite restricted and hopefully serves as a
starting point for how to best model those kinds of transforms.
Depends on https://github.com/llvm/llvm-project/pull/106431.
Fixes https://github.com/llvm/llvm-project/issues/82936.
PR: https://github.com/llvm/llvm-project/pull/106441
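For illustration, a scalar loop of roughly the following shape produces the
pattern the transform targets (the C++ form and the name scale_pairs are a
hypothetical sketch, not code from the patch; the updated test_2xi64 checks
below cover the equivalent IR). Before the transform, the VF=2 plan loads a
<4 x i64> vector, shuffles out the two members, multiplies each by the factor,
and re-interleaves for the store; after it, the plan emits a uniform scalar
load of factor[i], one <2 x i64> load, one multiply against the broadcast
factor, and one <2 x i64> store, advancing the induction by one original
iteration per vector iteration.

    #include <cstdint>

    // Hypothetical scalar loop: the loads and stores of data[2*i] and
    // data[2*i+1] form a load and a store interleave group when
    // vectorizing with VF=2 and no interleaving (UF=1).
    void scale_pairs(int64_t *__restrict data,
                     const int64_t *__restrict factor) {
      for (int64_t i = 0; i != 100; ++i) {
        int64_t f = factor[i];
        data[2 * i] = f * data[2 * i];         // member 0 of both groups
        data[2 * i + 1] = f * data[2 * i + 1]; // member 1 of both groups
      }
    }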
Added:
Modified:
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a8b217d19b047..b77ae8e54c78d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2247,6 +2247,36 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
}
}
+/// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can
+/// be converted to a narrower recipe. \p V is used by a wide recipe
+/// \p WideMember that feeds a store interleave group at index \p Idx, and
+/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
+/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it
+/// feeds all wide ops at all indices (checked via the operands of the wide
+/// recipe at lane 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to
+/// a wide load if \p V is defined at \p Idx of a load interleave group.
+static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
+ VPValue *V, unsigned Idx) {
+ auto *DefR = V->getDefiningRecipe();
+ if (!DefR)
+ return false;
+ if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
+ return !W->getMask() &&
+ all_of(zip(WideMember0->operands(), WideMember->operands()),
+ [V](const auto P) {
+ // V must be at the same places in both WideMember0 and
+ // WideMember.
+ const auto &[WideMember0Op, WideMemberOp] = P;
+ return (WideMember0Op == V) == (WideMemberOp == V);
+ });
+
+ if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
+ return IR->getInterleaveGroup()->getFactor() ==
+ IR->getInterleaveGroup()->getNumMembers() &&
+ IR->getVPValue(Idx) == V;
+ return false;
+}
+
/// Returns true if \p IR is a full interleave group with factor and number of
/// members both equal to \p VF. The interleave group must also access the full
/// vector width \p VectorRegWidth.
@@ -2284,7 +2314,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
- if (VF.isScalable() || !VectorLoop)
+ if (VF.isScalable() || !VectorLoop || Plan.getUF() != 1)
return;
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
@@ -2309,6 +2339,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (R.mayWriteToMemory() && !InterleaveR)
return;
+ // All other ops are allowed, but we reject uses that cannot be converted
+ // when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
@@ -2323,7 +2355,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// For now, we only support full interleave groups storing load interleave
// groups.
- if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+ if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
if (!DefR)
return false;
@@ -2333,7 +2365,25 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
IR->getInterleaveGroup()->getNumMembers() &&
IR->getVPValue(Op.index()) == Op.value();
})) {
+ StoreGroups.push_back(InterleaveR);
+ continue;
+ }
+
+ // Check if all values feeding InterleaveR are matching wide recipes, whose
+ // operands can be narrowed.
+ auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
+ InterleaveR->getStoredValues()[0]->getDefiningRecipe());
+ if (!WideMember0)
return;
+ for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
+ auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+ if (!R || R->getOpcode() != WideMember0->getOpcode() ||
+ R->getNumOperands() > 2)
+ return;
+ if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
+ return !canNarrowLoad(WideMember0, R, V, Idx);
+ }))
+ return;
}
StoreGroups.push_back(InterleaveR);
}
@@ -2341,23 +2391,41 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (StoreGroups.empty())
return;
- // Convert InterleaveGroup R to a single VPWidenLoadRecipe.
+ // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
- auto *LoadGroup = cast<VPInterleaveRecipe>(R);
- // Narrow interleave group to wide load, as transformed VPlan will only
+ if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
+ // Narrow interleave group to wide load, as transformed VPlan will only
+ // process one original iteration.
+ auto *L = new VPWidenLoadRecipe(
+ *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+ LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+ /*Reverse=*/false, LoadGroup->getDebugLoc());
+ L->insertBefore(LoadGroup);
+ return L;
+ }
+
+ auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+
+ // Narrow wide load to uniform scalar load, as transformed VPlan will only
// process one original iteration.
- auto *L = new VPWidenLoadRecipe(
- *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
- LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, LoadGroup->getDebugLoc());
- L->insertBefore(LoadGroup);
- return L;
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+ WideLoad->operands(), /*IsUniform*/ true);
+ N->insertBefore(WideLoad);
+ return N;
};
// Narrow operation tree rooted at store groups.
for (auto *StoreGroup : StoreGroups) {
- VPValue *Res =
- NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+ VPValue *Res = nullptr;
+ if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
+ StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
+ for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
+ WideMember0->setOperand(
+ Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
+ Res = WideMember0;
+ } else {
+ Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+ }
auto *S = new VPWidenStoreRecipe(
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 674a0fc5644c4..2db97045cf750 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -100,27 +100,27 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
-; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index 9b40c1137c516..9e6d932d1a4fd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -19,11 +19,19 @@ define void @load_store_interleave_group(ptr noalias %data) {
; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 8
-; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 051d85e6a488c..16b8907c8816a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx"
@@ -13,30 +13,13 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP12:%.*]] = shl nsw i64 [[TMP10]], 1
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP13]], align 8
-; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
-; VF2-NEXT: [[TMP14:%.*]] = fneg <2 x double> [[STRIDED_VEC3]]
-; VF2-NEXT: [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
-; VF2-NEXT: [[TMP9:%.*]] = fneg <2 x double> [[STRIDED_VEC4]]
-; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP13]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
+; VF2-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 1
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -99,34 +82,21 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT: [[TMP14:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP16:%.*]] = shl nsw i64 [[TMP14]], 1
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP14]]
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 2
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
-; VF2-NEXT: [[TMP4:%.*]] = fneg <2 x double> [[WIDE_LOAD]]
+; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; VF2-NEXT: [[TMP19:%.*]] = fneg <2 x double> [[WIDE_LOAD2]]
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP16]]
-; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP14]]
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 0
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 2
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
-; VF2-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
-; VF2-NEXT: [[TMP8:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
+; VF2-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
; VF2-NEXT: [[TMP15:%.*]] = fneg <2 x double> [[WIDE_LOAD3]]
-; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; VF2-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x double> [[TMP17]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC4]], ptr [[TMP20]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP14]], 2
; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -197,18 +167,15 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
-; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
-; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; VF2-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -990,34 +957,22 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP3]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
-; VF2-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
+; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT5]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
-; VF2-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP1]], 1
-; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
+; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP0]], 1
; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
-; VF2-NEXT: [[TMP14:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
-; VF2-NEXT: [[TMP15:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
-; VF2-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA_0]], align 8
-; VF2-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC4]], ptr [[DATA_1]], align 8
-; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: store <2 x i64> [[TMP13]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP0]], 1
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -1085,34 +1040,21 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
-; VF2-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
+; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; VF2-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
-; VF2-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP1]], 1
-; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
+; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP0]], 1
; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
-; VF2-NEXT: [[TMP14:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
; VF2-NEXT: [[TMP15:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT6]], [[BROADCAST_SPLAT4]]
-; VF2-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA_0]], align 8
; VF2-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC4]], ptr [[DATA_1]], align 8
-; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP0]], 2
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index aca3ca54b886a..00054a8ccdea7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx"
@@ -13,18 +13,12 @@ define void @load_store_interleave_group(ptr noalias %data) {
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 1
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -83,20 +77,13 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP8]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]]
-; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 1
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -158,25 +145,16 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF2: [[VECTOR_PH]]:
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP7]]
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
-; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD]], <2 x i64> [[WIDE_LOAD]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; VF2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD1]], <2 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC2:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC2]], ptr [[TMP8]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], 2
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -190,29 +168,20 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF4: [[VECTOR_PH]]:
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
; VF4: [[VECTOR_BODY]]:
-; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF4-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
-; VF4-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF4-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP7]]
; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
-; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
-; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; VF4-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[WIDE_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VF4-NEXT: [[INTERLEAVED_VEC2:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC2]], ptr [[TMP8]], align 8
-; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; VF4-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; VF4-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], 4
+; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
; VF4: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index 8482126ea7235..f226ae98e4225 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -19,23 +19,14 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP10]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]: