[llvm] [VPlan] Add transformation to narrow interleave groups. (PR #106441)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 7 07:17:39 PDT 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/106441
From 36c68a2c088a0edb15d57b58008b3e37a3db45bf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 16:53:00 +0100
Subject: [PATCH 1/7] [VPlan] Use pointer to member 0 as VPInterleaveRecipe's
pointer arg.
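
This patch carries no commit body, so here is a minimal IR sketch of its net
effect (function and value names are hypothetical). Instead of taking the
address of whichever member happens to be the group's insert position and
stepping back to member 0 with a negative GEP, the interleave group is now
addressed from member 0 directly:

  define void @sketch(ptr %dst, i64 %i) {
    ; before: address member 1, then step back one i32 to reach member 0
    %i.1 = add i64 %i, 1
    %gep.1 = getelementptr inbounds i32, ptr %dst, i64 %i.1
    %addr.old = getelementptr inbounds i32, ptr %gep.1, i32 -1
    ; after: use the member-0 address as the pointer argument directly
    %addr.new = getelementptr inbounds i32, ptr %dst, i64 %i
    store <8 x i32> zeroinitializer, ptr %addr.new, align 4
    ret void
  }

The test updates below show the same collapse, e.g. paired
"getelementptr ..., i64 2" / "getelementptr ..., i32 -2" steps replaced by a
single GEP of the member-0 address.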
---
.../Vectorize/LoopVectorizationPlanner.h | 10 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 8 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 37 +++----
.../Transforms/Vectorize/VPlanTransforms.cpp | 46 +++++++--
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
...rleaved-store-of-first-order-recurrence.ll | 3 +-
.../AArch64/sve-interleaved-accesses.ll | 30 +++---
.../sve-interleaved-masked-accesses.ll | 16 ++-
.../RISCV/interleaved-accesses.ll | 99 +++----------------
.../LoopVectorize/X86/interleave-cost.ll | 17 ++--
.../X86/vectorize-interleaved-accesses-gap.ll | 5 +-
.../x86-interleaved-accesses-masked-group.ll | 18 ++--
...86-interleaved-store-accesses-with-gaps.ll | 12 +--
...aved-accesses-different-insert-position.ll | 8 +-
.../LoopVectorize/interleaved-accesses.ll | 25 ++---
.../LoopVectorize/vplan-printing.ll | 5 +-
17 files changed, 140 insertions(+), 207 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00eec0a6f7b14e..5951873a960af2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -220,9 +220,15 @@ class VPBuilder {
new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
}
- VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL,
+ VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
- return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
+ return tryInsertInstruction(new VPInstruction(
+ Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+ }
+ VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+ const Twine &Name = "") {
+ return tryInsertInstruction(new VPInstruction(
+ Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
}
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
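
For reference, a small IR sketch (hypothetical names) of what the two builder
variants lower to: VPInstruction::generate emits a byte-wise GEP via
Builder.CreatePtrAdd or Builder.CreateInBoundsPtrAdd, the only difference
being the inbounds flag carried in GEPFlagsTy:

  define ptr @ptradd_lowering(ptr %base, i64 %off) {
    ; createPtrAdd(Ptr, Offset) lowers to a plain byte-offset GEP ...
    %p = getelementptr i8, ptr %base, i64 %off
    ; ... while createInBoundsPtrAdd(Ptr, Offset) adds the inbounds flag.
    %q = getelementptr inbounds i8, ptr %base, i64 %off
    ret ptr %q
  }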
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index db4631e19c11d3..30fadc2c939418 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9005,8 +9005,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
- CM.isScalarEpilogueAllowed());
+ VPlanTransforms::createInterleaveGroups(
+ *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
for (ElementCount VF : Range)
Plan->addVF(VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbcfaf9e19cd0c..f841355d320930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -956,7 +956,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};
-protected:
struct GEPFlagsTy {
char IsInBounds : 1;
GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
@@ -1307,6 +1306,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}
+ VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags = {false},
+ DebugLoc DL = {}, const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
+ ArrayRef<VPValue *>({Ptr, Offset}),
+ GEPFlagsTy(Flags), DL),
+ Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
+
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index dacba152611c19..1a039e0a736145 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -644,7 +644,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
- return Builder.CreatePtrAdd(Ptr, Addend, Name);
+ return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
+ : Builder.CreatePtrAdd(Ptr, Addend, Name);
}
case VPInstruction::ResumePhi: {
Value *IncomingFromVPlanPred =
@@ -2470,15 +2471,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
unsigned InterleaveFactor = Group->getFactor();
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
- // Prepare for the new pointers.
- unsigned Index = Group->getIndex(Instr);
-
// TODO: extend the masked interleaved-group support to reversed access.
VPValue *BlockInMask = getMask();
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");
- Value *Idx;
+ Value *Index;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
@@ -2486,35 +2484,24 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (Group->isReverse()) {
Value *RuntimeVF =
getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Idx = State.Builder.CreateMul(Idx,
- State.Builder.getInt32(Group->getFactor()));
- Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
- Idx = State.Builder.CreateNeg(Idx);
- } else
- Idx = State.Builder.getInt32(-Index);
+ Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+ Index = State.Builder.CreateMul(Index,
+ State.Builder.getInt32(Group->getFactor()));
+ Index = State.Builder.CreateNeg(Index);
+ } else {
+ // TODO: Drop redundant 0-index GEP as follow-up.
+ Index = State.Builder.getInt32(0);
+ }
VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // Notice current instruction could be any index. Need to adjust the address
- // to the member of index 0.
- //
- // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
- // b = A[i]; // Member of index 0
- // Current pointer is pointed to A[i+1], adjust it to A[i].
- //
- // E.g. A[i+1] = a; // Member of index 1
- // A[i] = b; // Member of index 0
- // A[i+2] = c; // Member of index 2 (Current instruction)
- // Current pointer is pointed to A[i+2], adjust it to A[i].
-
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
InBounds = gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
+ ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
State.setDebugLocFrom(Instr->getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
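
A worked sketch of the new index math, assuming a fixed VF of 4, an
interleave factor of 2, and i32 members (all hypothetical values): the
non-reversed case now starts at element index 0 of the member-0 address,
while the reversed case offsets by -((VF - 1) * Factor) = -6 elements so the
wide access still covers lanes VF-1 down to 0 of every member:

  define ptr @reverse_group_addr(ptr %member0) {
    ; Index = -((4 - 1) * 2) = -6 for the reversed case; 0 otherwise.
    %addr = getelementptr inbounds i32, ptr %member0, i32 -6
    ret ptr %addr
  }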
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a878613c4ba483..3b7f066f0636f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1590,14 +1590,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
}
void VPlanTransforms::createInterleaveGroups(
- const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+ VPlan &Plan,
+ const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+ &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
+ if (InterleaveGroups.empty())
+ return;
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
for (const auto *IG : InterleaveGroups) {
- auto *Recipe =
- cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1607,9 +1612,38 @@ void VPlanTransforms::createInterleaveGroups(
bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
- auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
- Recipe->getMask(), NeedsMaskForGaps);
- VPIG->insertBefore(Recipe);
+
+ Instruction *IRInsertPos = IG->getInsertPos();
+ auto *InsertPos =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+
+ // Get or create the start address for the interleave group.
+ auto *Start =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
+ VPValue *Addr = Start->getAddr();
+ if (!VPDT.properlyDominates(Addr->getDefiningRecipe(), InsertPos)) {
+ bool InBounds = false;
+ if (auto *Gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+ InBounds = Gep->isInBounds();
+
+ // We cannot re-use the address of the first member because it does not
+ // dominate the insert position. Use the address of the insert position
+ // and create a PtrAdd to adjust the index to start at the first member.
+ APInt Offset(32,
+ getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+ IG->getIndex(IRInsertPos),
+ /*IsSigned=*/true);
+ VPValue *OffsetVPV = Plan.getOrAddLiveIn(
+ ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+ VPBuilder B(InsertPos);
+ Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
+ : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
+ }
+ auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
+ InsertPos->getMask(), NeedsMaskForGaps);
+ VPIG->insertBefore(InsertPos);
+
unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
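
A sketch of this fallback path with hypothetical numbers: for a group of i32
members whose insert position is member 3, Offset = 4 bytes * 3 = 12, and the
emitted PtrAdd applies the negated offset to the insert position's address to
reach member 0 (inbounds only if the original GEP was inbounds):

  define ptr @start_addr_from_insert_pos(ptr %insert.pos.addr) {
    ; PtrAdd(InsertPos->getAddr(), -12) lowers to a byte-wise GEP.
    %start = getelementptr inbounds i8, ptr %insert.pos.addr, i64 -12
    ret ptr %start
  }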
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f4a17aec42b244..3b792ee32dce6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -114,7 +114,9 @@ struct VPlanTransforms {
// widening its memory instructions with a single VPInterleaveRecipe at its
// insertion point.
static void createInterleaveGroups(
- const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+ VPlan &Plan,
+ const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+ &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
/// Remove dead recipes from \p Plan.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
index 87674f611251cc..997ef7466d5cfc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
@@ -14,8 +14,7 @@ define void @interleaved_store_first_order_recurrence(ptr noalias %src, ptr %dst
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 2
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
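
As the TODO in VPInterleaveRecipe::execute notes, non-reversed groups
currently keep a redundant zero-index GEP ([[TMP7]] above), to be dropped as
a follow-up. A sketch with hypothetical names:

  define ptr @residual_zero_gep(ptr %member0) {
    ; index 0 makes this GEP a no-op; it survives until the follow-up.
    %addr = getelementptr inbounds i32, ptr %member0, i32 0
    ret ptr %addr
  }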
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0959d913fd0cd5..ba4145217c3ba5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -41,13 +41,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,6 +119,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -128,13 +127,11 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -404,10 +401,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
-; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
@@ -715,16 +712,14 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1271,14 +1266,11 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
+; CHECK-NEXT: [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
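
In PR27626_4 above, the address of member 0 is itself a vector of pointers (a
widened GEP), so the group's scalar start address becomes lane 0 of that
vector. A sketch with hypothetical names:

  define ptr @lane0_of_vector_gep(ptr %a, <4 x i64> %vec.ind) {
    ; the scalar address for the interleaved store is lane 0 of the
    ; vector GEP that defines member 0's address.
    %ptrs = getelementptr inbounds i32, ptr %a, <4 x i64> %vec.ind
    %p = extractelement <4 x ptr> %ptrs, i64 0
    ret ptr %p
  }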
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 73e94f2e7ee099..baec7daa463d13 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -55,15 +55,13 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -135,15 +133,13 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP7]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 42d3148ac96315..f70e9d6c9416ac 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -29,12 +29,9 @@ define void @load_store_factor2_i32(ptr %p) {
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -77,13 +74,10 @@ define void @load_store_factor2_i32(ptr %p) {
; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -135,12 +129,9 @@ define void @load_store_factor2_i32(ptr %p) {
; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
-; SCALABLE-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -218,12 +209,9 @@ define void @load_store_factor2_i64(ptr %p) {
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
-; CHECK-NEXT: store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; CHECK-NEXT: store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -266,13 +254,10 @@ define void @load_store_factor2_i64(ptr %p) {
; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -324,12 +309,9 @@ define void @load_store_factor2_i64(ptr %p) {
; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
-; SCALABLE-NEXT: store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; SCALABLE-NEXT: store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -398,17 +380,13 @@ define void @load_store_factor3_i32(ptr %p) {
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -456,17 +434,13 @@ define void @load_store_factor3_i32(ptr %p) {
; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; FIXED-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -514,17 +488,13 @@ define void @load_store_factor3_i32(ptr %p) {
; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -603,17 +573,13 @@ define void @load_store_factor3_i64(ptr %p) {
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -661,17 +627,13 @@ define void @load_store_factor3_i64(ptr %p) {
; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; FIXED-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -719,17 +681,13 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -813,22 +771,13 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], 1
; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP11]], 1
; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], 1
; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -837,7 +786,7 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -915,22 +864,13 @@ define void @load_store_factor8(ptr %p) {
; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; FIXED-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; FIXED-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 1
; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; FIXED-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], 1
; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; FIXED-NEXT: [[TMP13:%.*]] = add i64 [[TMP11]], 1
; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; FIXED-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], 1
; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; FIXED-NEXT: [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; FIXED-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; FIXED-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -939,7 +879,7 @@ define void @load_store_factor8(ptr %p) {
; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1017,22 +957,13 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 1
; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], 1
; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; SCALABLE-NEXT: [[TMP13:%.*]] = add i64 [[TMP11]], 1
; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; SCALABLE-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], 1
; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; SCALABLE-NEXT: [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; SCALABLE-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1041,7 +972,7 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
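
In both the FIXED and SCALABLE output above, the factor-8 store no longer needs a
scalar index chain for the last member: the repeated `add i64 ..., 1` steps and the
compensating `getelementptr ..., i32 -7` disappear, and the wide store simply reuses
the pointer already computed for member 0. A minimal before/after sketch in LLVM IR
(names hypothetical):

  ; before: compute the address of member 7, then step back 7 elements
  %idx.7 = add i64 %idx, 7
  %gep.7 = getelementptr i64, ptr %p, i64 %idx.7
  %gep.0 = getelementptr i64, ptr %gep.7, i32 -7
  store <16 x i64> %interleaved, ptr %gep.0, align 8

  ; after: store directly through the member-0 pointer
  %gep.0 = getelementptr i64, ptr %p, i64 %idx
  store <16 x i64> %interleaved, ptr %gep.0, align 8
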
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index cc1d11754b27ed..9383799b181c82 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -89,12 +89,11 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP41]]
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 0
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -102,8 +101,8 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -266,8 +265,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]]
; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -490,11 +488,10 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP50]]
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !alias.scope [[META6:![0-9]+]]
-; CHECK-NEXT: [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7
-; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 0
; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
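
The interleave-cost tests show the same rewrite from the store side: the
`or disjoint` index of the last member and its negative-offset GEP are gone, leaving
the member-0 GEP plus a zero-offset GEP (presumably the per-part pointer, which
trivially folds away later). Condensed sketch (names hypothetical):

  ; after: member-0 address plus a foldable part-0 offset
  %gep.0 = getelementptr float, ptr %dst, i64 %idx
  %gep.part0 = getelementptr float, ptr %gep.0, i32 0
  store <8 x float> %interleaved, ptr %gep.part0, align 4
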
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
index 6e83cf612f82bd..0bbf76edde2626 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
@@ -67,9 +67,8 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 {
; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
; CHECK: pred.store.continue14:
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP2]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 -2
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; CHECK-NEXT: [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false>
; CHECK-NEXT: call void @llvm.masked.store.v48i8.p0(<48 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison>, ptr [[TMP14]], i32 1, <48 x i1> [[TMP15]])
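
This group also has gaps: only members 0 and 2 of each 6-member slot are stored, so
the interleaved value carries poison in the gap lanes and the replicated condition
mask is ANDed with a constant gap mask before the masked store. A condensed factor-3
analogue of the <48 x i8> pattern above (names hypothetical):

  ; two slots of a factor-3 group storing only members 0 and 2
  %gap.mask = and <6 x i1> %rep.mask, <i1 true, i1 false, i1 true, i1 true, i1 false, i1 true>
  call void @llvm.masked.store.v6i8.p0(<6 x i8> %vals, ptr %gep.0, i32 1, <6 x i1> %gap.mask)
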
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 6b52023cfbcaec..968058134690bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1419,13 +1419,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC1]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -2555,13 +2553,11 @@ define dso_local void @masked_strided2_unknown_tc(ptr noalias nocapture readonly
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or disjoint i32 [[TMP2]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP5]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2989,13 +2985,11 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(ptr noalias noca
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
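
The masked strided variants change in the same way: the interleaved masked store now
takes the member-0 pointer `Q + idx` directly instead of `Q + (idx | 1)` rebased by
`-1`. Condensed from the checks above (names hypothetical):

  ; after: masked interleaved store addressed at member 0
  %gep.0 = getelementptr i8, ptr %q, i32 %idx
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %interleaved, ptr %gep.0, i32 1, <16 x i1> %mask)
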
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
index fabe2eb8062bbe..b850dc3ecef85d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -72,17 +72,15 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur
;
; ENABLED_MASKED_STRIDED-LABEL: @test1(
; ENABLED_MASKED_STRIDED-NEXT: entry:
-; ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDEX]], 2
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[POINTS:%.*]], i64 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], i32 2, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>)
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -239,7 +237,6 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -249,11 +246,10 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP1]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 2
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[POINTS:%.+]], i64 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP3]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or disjoint i64 [[TMP2]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP4]]
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison>
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>
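
With the group keyed to member 0, the loop-invariant
`getelementptr i8, ptr %points, i64 -2` rebase is no longer needed; the start address
is instead computed in bytes straight off `%points` (`shl ..., 3`, i.e. the 8-byte
stride of a factor-4 group of i16). Sketch:

  ; after: byte-scaled member-0 address, 8 bytes per iteration
  %byte.off = shl nsw i64 %index, 3
  %gep = getelementptr inbounds i8, ptr %points, i64 %byte.off
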
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 5028718dc49d15..8773350bdb4243 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -18,7 +18,8 @@ define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 -1
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP41]], align 1
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC2]], [[STRIDED_VEC]]
@@ -92,8 +93,9 @@ define void @test_ig_insert_pos_at_end_of_vpbb(ptr noalias %dst, ptr noalias %sr
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { i16, i16, i16, i16 }, ptr [[SRC]], i64 [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 -2
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 -4
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP51]], align 2
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[STRIDED_VEC]], i32 3
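
In the insert-position tests the addressing is equivalent but reshaped:
`getelementptr i16, ..., i32 -2` becomes `getelementptr i8, ..., i32 -4` (the same
two-i16 step back, now in bytes), followed by the zero-offset per-part GEP the new
lowering appends:

  ; -2 x i16 elements == -4 bytes; same start address either way
  %start = getelementptr i8, ptr %gep.member, i32 -4
  %part0 = getelementptr i16, ptr %start, i32 0
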
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 12d0078617d932..cba35dcb78ca43 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -38,13 +38,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -115,9 +113,9 @@ define void @test_struct_array_load3_store3() {
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -270,10 +268,10 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -357,8 +355,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -28
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -24
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -669,15 +667,13 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1286,19 +1282,16 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP7]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP8]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP9]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
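
The reversed-access hunk above is a useful sanity check that the rebased offsets
agree: the old code stepped back 28 bytes from member 1, the new code 24 bytes from
member 0, and with 4-byte i32 members those are the same address (member 1 =
member 0 + 4, so member 0 + 4 - 28 = member 0 - 24). In IR terms:

  ; reversed factor-2 i32 group at VF 4: both forms reach the same lane
  ;   old: &B[off].y - 28 == (&B[off].x + 4) - 28 == &B[off].x - 24
  ;   new: &B[off].x - 24
  %gep.0 = getelementptr inbounds %struct.ST2, ptr %B, i64 %off, i32 0
  %start = getelementptr inbounds i8, ptr %gep.0, i64 -24
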
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 26974c2307065a..b36e7cf092c855 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -349,10 +349,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: ir<%AB.0> = load from index 0
; CHECK-NEXT: ir<%AB.1> = load from index 1
; CHECK-NEXT: ir<%AB.3> = load from index 3
-; CHECK-NEXT: CLONE ir<%iv.plus.3> = add vp<[[STEPS]]>, ir<3>
; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%AB.0>, ir<%AB.1>
-; CHECK-NEXT: CLONE ir<%gep.CD.3> = getelementptr inbounds ir<@CD>, ir<0>, ir<%iv.plus.3>
-; CHECK-NEXT: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.3>
+; CHECK-NEXT: CLONE ir<%gep.CD.0> = getelementptr inbounds ir<@CD>, ir<0>, vp<[[STEPS]]>
+; CHECK-NEXT: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.0>
; CHECK-NEXT: store ir<%add> to index 0
; CHECK-NEXT: store ir<1> to index 1
; CHECK-NEXT: store ir<2> to index 2
>From 725a1e7cab8163a9968337ebdd3e0f1269443588 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 15 Aug 2024 13:38:58 +0100
Subject: [PATCH 2/7] [LV] Add
---
...sform-narrow-interleave-to-widen-memory.ll | 779 ++++++++
...sform-narrow-interleave-to-widen-memory.ll | 1677 +++++++++++++++++
2 files changed, 2456 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
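
The two new files record the current interleave-group codegen for loops that scale
each member of a small struct (or each of a pair of adjacent elements) by one shared
factor; they are the baseline against which the narrowing transform added later in
this series is checked. The per-iteration pattern all of the tests share, condensed
to LLVM IR (names hypothetical):

  %f = load i64, ptr %factor.gep      ; factor shared by all members
  %m = load i64, ptr %member.gep      ; one member of the interleave group
  %mul = mul i64 %f, %m
  store i64 %mul, ptr %member.gep     ; written back to the same member
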
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 00000000000000..d6cd44d0c4f0ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,779 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s
+; https://github.com/llvm/llvm-project/issues/82936
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_4xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
+; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %data.3 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 3
+ %l.3 = load i64, ptr %data.3, align 8
+ %mul.3 = mul i64 %l.factor, %l.3
+ store i64 %mul.3, ptr %data.3, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.0 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.0, align 8
+ %mul.1 = mul i64 %l.factor, %l.0
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_1(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.0, align 8
+ store i64 %mul.0, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_2(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.1, align 8
+ store i64 %mul.1, ptr %data.0, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+ %l.src.0 = load i64, ptr %gep.src.0, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %gep.src.0, align 8
+ %mul.0 = mul i64 %l.src.0, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+ %l.src.1 = load i64, ptr %gep.src.1, align 8
+ %mul.1 = mul i64 %l.src.1, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 8, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i32, ptr %data.2, align 8
+ %mul.2 = mul i32 %l.factor, %l.2
+ store i32 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 00000000000000..accd340a0d8ea8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,1677 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+; https://github.com/llvm/llvm-project/issues/82936
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP15]], i32 0
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP15]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF2-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0
+; VF2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP19]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
+; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP30]]
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP32]]
+; VF4-NEXT: [[TMP34:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP35:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP36:%.*]] = load i64, ptr [[TMP31]], align 8
+; VF4-NEXT: [[TMP37:%.*]] = load i64, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = insertelement <4 x i64> poison, i64 [[TMP34]], i32 0
+; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1
+; VF4-NEXT: [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3
+; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP41]]
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP31]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]]
+; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP17]], i32 0
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP18]], i32 1
+; VF2-NEXT: [[TMP21:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP20]]
+; VF2-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0
+; VF2-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP21]], i32 1
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP16]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP21]]
+; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP23]]
+; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP25]]
+; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP28]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> poison, i64 [[TMP29]], i32 0
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 2
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 3
+; VF4-NEXT: [[TMP37:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP36]]
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP37]], i32 0
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP37]], i32 1
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP37]], i32 2
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP37]], i32 3
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP22]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP28]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.0 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.0, align 8
+ %mul.1 = mul i64 %l.factor, %l.0
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.0, align 8
+ store i64 %mul.0, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
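+; Descriptive note (editor addition, derived from the IR below): same cross-swapped
+; stores as @test_2xi64_store_order_flipped_1 (%mul.0 to %data.1, %mul.1 to %data.0),
+; but issued in the opposite program order.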
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.1, align 8
+ store i64 %mul.1, ptr %data.0, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
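+; Descriptive note (editor addition, derived from the IR below): the two multiplies
+; are fed by loads from two different source arrays (%src.0 and %src.1) rather than
+; one shared factor load.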
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0
+; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
+; VF2-NEXT: store i64 [[TMP10]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
+; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[TMP17]], i32 1
+; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD2]], [[TMP19]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF2-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP27:%.*]] = or disjoint i64 [[TMP26]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF2-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0
+; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1
+; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; VF4-NEXT: store i64 [[TMP16]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP21]], align 8
+; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x i64> poison, i64 [[TMP26]], i32 0
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP27]], i32 1
+; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 2
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 3
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
+; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP36:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[TMP33]]
+; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP36]], i32 0
+; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP36]], i32 1
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP21]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP36]], i32 2
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP36]], i32 3
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF4-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP42]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP42]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP43]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF4-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+ %l.src.0 = load i64, ptr %gep.src.0, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %gep.src.0, align 8
+ %mul.0 = mul i64 %l.src.0, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+ %l.src.1 = load i64, ptr %gep.src.1, align 8
+ %mul.1 = mul i64 %l.src.1, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
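+; Descriptive note (editor addition, derived from the IR below): interleave group
+; with three i64 members; every field of a { i64, i64, i64 } struct is multiplied
+; by the same loaded factor.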
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
+; VF2-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP9]]
+; VF2-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
+; VF2-NEXT: store i64 [[TMP11]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
+; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0
+; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[TMP16]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP18]]
+; VF2-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i32 0
+; VF2-NEXT: store i64 [[TMP20]], ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i32 1
+; VF2-NEXT: store i64 [[TMP21]], ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP24]], i32 0
+; VF2-NEXT: [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1
+; VF2-NEXT: [[TMP28:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP27]]
+; VF2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i32 0
+; VF2-NEXT: store i64 [[TMP29]], ptr [[TMP22]], align 8
+; VF2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i32 1
+; VF2-NEXT: store i64 [[TMP30]], ptr [[TMP23]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF2-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_3xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3
+; VF4-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP17]]
+; VF4-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; VF4-NEXT: store i64 [[TMP19]], ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; VF4-NEXT: store i64 [[TMP20]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP8]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP27]], i32 0
+; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 1
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 2
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 3
+; VF4-NEXT: [[TMP35:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP34]]
+; VF4-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP35]], i32 0
+; VF4-NEXT: store i64 [[TMP36]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP35]], i32 1
+; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP35]], i32 2
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP35]], i32 3
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT: [[TMP41:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT: [[TMP42:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = load i64, ptr [[TMP40]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = load i64, ptr [[TMP41]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = load i64, ptr [[TMP42]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = load i64, ptr [[TMP43]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = insertelement <4 x i64> poison, i64 [[TMP44]], i32 0
+; VF4-NEXT: [[TMP49:%.*]] = insertelement <4 x i64> [[TMP48]], i64 [[TMP45]], i32 1
+; VF4-NEXT: [[TMP50:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP46]], i32 2
+; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i64> [[TMP50]], i64 [[TMP47]], i32 3
+; VF4-NEXT: [[TMP52:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP51]]
+; VF4-NEXT: [[TMP53:%.*]] = extractelement <4 x i64> [[TMP52]], i32 0
+; VF4-NEXT: store i64 [[TMP53]], ptr [[TMP40]], align 8
+; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x i64> [[TMP52]], i32 1
+; VF4-NEXT: store i64 [[TMP54]], ptr [[TMP41]], align 8
+; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i64> [[TMP52]], i32 2
+; VF4-NEXT: store i64 [[TMP55]], ptr [[TMP42]], align 8
+; VF4-NEXT: [[TMP56:%.*]] = extractelement <4 x i64> [[TMP52]], i32 3
+; VF4-NEXT: store i64 [[TMP56]], ptr [[TMP43]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP57]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF4-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
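+; Descriptive note (editor addition, derived from the IR below): i32 variant; the
+; factor is loaded as i32 through an i64-typed GEP, and only the first two members
+; of the { i32, i32, i32 } struct are updated.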
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP50]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
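+; Descriptive note (editor addition, derived from the IR below): same as
+; @test_2xi32, but all three i32 members of the struct are updated.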
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP26]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 8
+; VF2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0
+; VF2-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF2-NEXT: [[TMP32:%.*]] = mul <2 x i32> [[TMP7]], [[TMP31]]
+; VF2-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; VF2-NEXT: store i32 [[TMP33]], ptr [[TMP26]], align 8
+; VF2-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1
+; VF2-NEXT: store i32 [[TMP34]], ptr [[TMP27]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF2-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_3xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP50:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT: [[TMP51:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT: [[TMP52:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP50]], align 8
+; VF4-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP51]], align 8
+; VF4-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP52]], align 8
+; VF4-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP53]], align 8
+; VF4-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i32 0
+; VF4-NEXT: [[TMP59:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP55]], i32 1
+; VF4-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP56]], i32 2
+; VF4-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP57]], i32 3
+; VF4-NEXT: [[TMP62:%.*]] = mul <4 x i32> [[TMP15]], [[TMP61]]
+; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i32> [[TMP62]], i32 0
+; VF4-NEXT: store i32 [[TMP63]], ptr [[TMP50]], align 8
+; VF4-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP62]], i32 1
+; VF4-NEXT: store i32 [[TMP64]], ptr [[TMP51]], align 8
+; VF4-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[TMP62]], i32 2
+; VF4-NEXT: store i32 [[TMP65]], ptr [[TMP52]], align 8
+; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP62]], i32 3
+; VF4-NEXT: store i32 [[TMP66]], ptr [[TMP53]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF4-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i32, ptr %data.2, align 8
+ %mul.2 = mul i32 %l.factor, %l.2
+ store i32 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF2: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF2: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF2: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF2: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF2: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF2: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF2: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF2: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
+; VF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
>From 8252b0cc74c6c1ae7cdc28af51d7a5f82c5bbb7a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Jul 2024 11:24:55 +0100
Subject: [PATCH 3/7] [VPlan] Add transformation to narrow interleave groups.
This patch adds a new narrowInterleaveGroups transform, which tries to
convert a plan containing interleave groups with VF elements into a
plan that instead replaces the interleave groups with wide loads and
stores processing VF elements.
This is effectively a very simple form of loop-aware SLP, where
interleave groups are used to identify candidates.
This initial version is quite restricted and hopefully serves as a
starting point for modeling these kinds of transforms.
Depends on https://github.com/llvm/llvm-project/pull/106431.
Fixes https://github.com/llvm/llvm-project/issues/82936
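As a sketch of the intended effect (hand-written IR; the names are
illustrative, compare the test_4xi64 diff below): with VF=4 and a
factor-4 interleave group, the vector loop currently loads 16 elements
and de-interleaves them:
  %wide.vec = load <16 x i64>, ptr %data.gep, align 8
  %strided.vec = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
After narrowing, each vector iteration processes one original
iteration, so the group becomes a single wide load and the factor
operand becomes a uniform scalar load broadcast to all lanes:
  %wide.load = load <4 x i64>, ptr %data.gep, align 8
  %f = load i64, ptr %factor.gep, align 8
  %f.ins = insertelement <4 x i64> poison, i64 %f, i64 0
  %f.splat = shufflevector <4 x i64> %f.ins, <4 x i64> poison, <4 x i32> zeroinitializer
  %mul = mul <4 x i64> %f.splat, %wide.load
  store <4 x i64> %mul, ptr %data.gep, align 8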
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 128 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 2 +
...sform-narrow-interleave-to-widen-memory.ll | 74 ++++------
5 files changed, 162 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 30fadc2c939418..cb55b6636fc279 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7507,6 +7507,9 @@ LoopVectorizationPlanner::executePlan(
OrigLoop->getHeader()->getModule()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF)) {
+ LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
+ }
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a039e0a736145..96b16164738181 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -48,6 +48,9 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
+ case VPInstructionSC: {
+ return !Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode());
+ }
case VPInterleaveSC:
return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
@@ -63,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBranchOnMaskSC:
case VPScalarIVStepsSC:
case VPPredInstPHISC:
+ case VPVectorPointerSC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b7f066f0636f3..285c5be7a105d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,6 +668,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
+
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
VPBasicBlock *ExitingVPBB =
@@ -710,6 +711,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// TODO: Further simplifications are possible
// 1. Replace inductions with constants.
// 2. Replace vector loop region with VPBasicBlock.
+ //
}
/// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1657,3 +1659,129 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
+static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+ if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
+ if (W->getMask())
+ return false;
+ return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
+ }
+
+ if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe())) {
+ return IR->getInterleaveGroup()->getFactor() ==
+ IR->getInterleaveGroup()->getNumMembers() &&
+ IR->getVPValue(Idx) == V;
+ }
+ return false;
+}
+
+/// Returns true of \p IR is a consecutive interleave group with \p VF members.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
+ ElementCount VF) {
+ if (!IR)
+ return false;
+ auto IG = IR->getInterleaveGroup();
+ return IG->getFactor() == IG->getNumMembers() &&
+ IG->getNumMembers() == VF.getKnownMinValue();
+}
+
+bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+ using namespace llvm::VPlanPatternMatch;
+ if (VF.isScalable())
+ return false;
+
+ bool Changed = false;
+ SmallVector<VPInterleaveRecipe *> StoreGroups;
+ for (auto &R : make_early_inc_range(
+ *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+ if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+ isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+
+ // Bail out on recipes not supported at the moment:
+ // * phi recipes other than the canonical induction
+ // * recipes writing to memory except interleave groups
+ // Only support plans with a canonical induction phi.
+ if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
+ (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
+ return false;
+
+ auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+ if (!IR)
+ continue;
+
+ if (!isConsecutiveInterleaveGroup(IR, VF))
+ return false;
+ if (IR->getStoredValues().empty())
+ continue;
+
+ auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
+ IR->getStoredValues()[0]->getDefiningRecipe());
+ if (!Lane0)
+ return false;
+ for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
+ auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+ if (!R || R->getOpcode() != Lane0->getOpcode())
+ return false;
+ // Work around captured structured bindings being a C++20 extension.
+ auto Idx = I;
+ if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+ return !supportedLoad(Lane0, V, Idx);
+ }))
+ return false;
+ }
+
+ StoreGroups.push_back(IR);
+ }
+
+ // Narrow operation tree rooted at store groups.
+ for (auto *StoreGroup : StoreGroups) {
+ auto *Lane0 = cast<VPWidenRecipe>(
+ StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+ unsigned LoadGroupIdx =
+ isa<VPInterleaveRecipe>(Lane0->getOperand(1)->getDefiningRecipe()) ? 1
+ : 0;
+ unsigned WideLoadIdx = 1 - LoadGroupIdx;
+ auto *LoadGroup = cast<VPInterleaveRecipe>(
+ Lane0->getOperand(LoadGroupIdx)->getDefiningRecipe());
+
+ auto *WideLoad = cast<VPWidenLoadRecipe>(
+ Lane0->getOperand(WideLoadIdx)->getDefiningRecipe());
+
+ // Narrow wide load to uniform scalar load, as transformed VPlan will only
+ // process one original iteration.
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+ WideLoad->operands(), true);
+ // Narrow interleave group to wide load, as transformed VPlan will only
+ // process one original iteration.
+ auto *L = new VPWidenLoadRecipe(
+ *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+ LoadGroup->getAddr(), LoadGroup->getMask(), true, false,
+ LoadGroup->getDebugLoc());
+ L->insertBefore(LoadGroup);
+ N->insertBefore(LoadGroup);
+ Lane0->setOperand(LoadGroupIdx, L);
+ Lane0->setOperand(WideLoadIdx, N);
+
+ auto *S = new VPWidenStoreRecipe(
+ *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+ StoreGroup->getAddr(), Lane0, nullptr, true, false,
+ StoreGroup->getDebugLoc());
+ S->insertBefore(StoreGroup);
+ StoreGroup->eraseFromParent();
+ Changed = true;
+ }
+
+ if (!Changed)
+ return false;
+
+ // Adjust induction to reflect that the transformed plan only processes one
+ // original iteration.
+ auto *CanIV = Plan.getCanonicalIV();
+ VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+ Inc->setOperand(
+ 1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ removeDeadRecipes(Plan);
+ return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6e..0bb9d0a7fde853 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,8 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+
+ static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index d6cd44d0c4f0ca..ffe66ff0007fbb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,28 +20,16 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -50,23 +38,23 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
-; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_2]], [[L_0]]
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_2]], [[L_1]]
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
-; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
-; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[DATA_4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_4:%.*]] = load i64, ptr [[DATA_4]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_2]], [[L_4]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_4]], align 8
; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
-; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_2]], [[L_3]]
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -129,13 +117,10 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,14 +197,11 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -297,13 +279,10 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -458,6 +437,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
@@ -469,7 +449,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
@@ -554,14 +534,12 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -693,14 +671,12 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
>From 00471e2d90ca8bf6f5bccb3cec632060240b6b3c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 20:53:01 +0100
Subject: [PATCH 4/7] !fixup address latest comments.
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 -
2 files changed, 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 6ddbfcf0ecfe58..a18ab844c5ebec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -954,7 +954,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
- assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
unsigned UF = getUF();
if (VF.getNumUsers()) {
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 285c5be7a105d5..219ffc61b9ecaa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,7 +668,6 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
-
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
VPBasicBlock *ExitingVPBB =
>From 61279d125ce45d95648ca7784b57e388ae7d0661 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 21:32:15 +0100
Subject: [PATCH 5/7] !fixup address latest comments, thanks!
---
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 41 ++++++++++---------
.../Transforms/Vectorize/VPlanTransforms.h | 2 +-
3 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb55b6636fc279..3eb830bcc02098 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7506,10 +7506,8 @@ LoopVectorizationPlanner::executePlan(
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
OrigLoop->getHeader()->getModule()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF);
- if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF)) {
- LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
- }
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 219ffc61b9ecaa..58188379c7fabb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -29,6 +29,9 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
@@ -1674,25 +1677,24 @@ static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
return false;
}
-/// Returns true of \p IR is a consecutive interleave group with \p VF members.
+/// Returns true if \p IR is a full interleave group with factor and number of
+/// members both equal to \p VF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
ElementCount VF) {
if (!IR)
return false;
auto IG = IR->getInterleaveGroup();
return IG->getFactor() == IG->getNumMembers() &&
- IG->getNumMembers() == VF.getKnownMinValue();
+ IG->getNumMembers() == VF.getFixedValue();
}
-bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
using namespace llvm::VPlanPatternMatch;
if (VF.isScalable())
- return false;
+ return;
- bool Changed = false;
SmallVector<VPInterleaveRecipe *> StoreGroups;
- for (auto &R : make_early_inc_range(
- *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+ for (auto &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
isa<VPCanonicalIVPHIRecipe>(&R))
continue;
@@ -1701,38 +1703,43 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
// * phi recipes other than the canonical induction
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
- if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
- (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
- return false;
+ if (R.isPhi())
+ return;
auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+ if (R.mayWriteToMemory() && !IR)
+ return;
+
if (!IR)
continue;
if (!isConsecutiveInterleaveGroup(IR, VF))
- return false;
+ return;
if (IR->getStoredValues().empty())
continue;
auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
IR->getStoredValues()[0]->getDefiningRecipe());
if (!Lane0)
- return false;
+ return;
for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != Lane0->getOpcode())
- return false;
+ return;
// Work around captured structured bindings being a C++20 extension.
auto Idx = I;
if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
return !supportedLoad(Lane0, V, Idx);
}))
- return false;
+ return;
}
StoreGroups.push_back(IR);
}
+ if (StoreGroups.empty())
+ return;
+
// Narrow operation tree rooted at store groups.
for (auto *StoreGroup : StoreGroups) {
auto *Lane0 = cast<VPWidenRecipe>(
@@ -1769,12 +1776,8 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
- Changed = true;
}
- if (!Changed)
- return false;
-
// Adjust induction to reflect that the transformed plan only processes one
// original iteration.
auto *CanIV = Plan.getCanonicalIV();
@@ -1782,5 +1785,5 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
Inc->setOperand(
1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
removeDeadRecipes(Plan);
- return true;
+ LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0bb9d0a7fde853..efe4bde8998fd4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -122,7 +122,7 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
- static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
+ static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
};
} // namespace llvm
>From 3637cfb0b11a625d7303f385524641402f5b2c0c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Sep 2024 19:24:47 +0100
Subject: [PATCH 6/7] !fixup address latest comments, thanks!
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 58188379c7fabb..b51a28dafea7ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -713,7 +713,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// TODO: Further simplifications are possible
// 1. Replace inductions with constants.
// 2. Replace vector loop region with VPBasicBlock.
- //
}
/// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1664,8 +1663,6 @@ void VPlanTransforms::createInterleaveGroups(
static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
- if (W->getMask())
- return false;
return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
}
@@ -1726,9 +1723,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != Lane0->getOpcode())
return;
- // Work around captured structured bindings being a C++20 extension.
- auto Idx = I;
- if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+ if (any_of(R->operands(), [Lane0, Idx=I](VPValue *V) {
return !supportedLoad(Lane0, V, Idx);
}))
return;
>From f2dcf3d2c9c1c94704355d157b6ec6d20a2dce6e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 27 Sep 2024 09:37:21 +0100
Subject: [PATCH 7/7] !fix formatting
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b51a28dafea7ad..a1144fd9d801c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1723,7 +1723,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != Lane0->getOpcode())
return;
- if (any_of(R->operands(), [Lane0, Idx=I](VPValue *V) {
+ if (any_of(R->operands(), [Lane0, Idx = I](VPValue *V) {
return !supportedLoad(Lane0, V, Idx);
}))
return;
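For reference, the narrowing above can be exercised with the flags
commonly used in LoopVectorize tests (a sketch; the updated RUN lines
in the tests themselves are authoritative):
  ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s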