[llvm] [VPlan] Add transformation to narrow interleave groups. (PR #106441)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 22 14:21:53 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/106441

From 36c68a2c088a0edb15d57b58008b3e37a3db45bf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 16:53:00 +0100
Subject: [PATCH 01/19] [VPlan] Use pointer to member 0 as VPInterleaveRecipe's
 pointer arg.

---
 .../Vectorize/LoopVectorizationPlanner.h      | 10 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  8 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 37 +++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 46 +++++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |  4 +-
 ...rleaved-store-of-first-order-recurrence.ll |  3 +-
 .../AArch64/sve-interleaved-accesses.ll       | 30 +++---
 .../sve-interleaved-masked-accesses.ll        | 16 ++-
 .../RISCV/interleaved-accesses.ll             | 99 +++----------------
 .../LoopVectorize/X86/interleave-cost.ll      | 17 ++--
 .../X86/vectorize-interleaved-accesses-gap.ll |  5 +-
 .../x86-interleaved-accesses-masked-group.ll  | 18 ++--
 ...86-interleaved-store-accesses-with-gaps.ll | 12 +--
 ...aved-accesses-different-insert-position.ll |  8 +-
 .../LoopVectorize/interleaved-accesses.ll     | 25 ++---
 .../LoopVectorize/vplan-printing.ll           |  5 +-
 17 files changed, 140 insertions(+), 207 deletions(-)

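For orientation, the kind of loop this patch targets is one whose adjacent memory accesses form an interleave group. The snippet below is illustrative only and not taken from the patch; it shows which access is member 0 of a factor-2 group and why the recipe's pointer argument matters.

  // Illustrative C++ loop whose two stores form a factor-2 interleave group.
  void store_factor2(int *dst, int a, int b, int n) {
    for (int i = 0; i < n; i++) {
      dst[2 * i] = a;     // member 0 of the group
      dst[2 * i + 1] = b; // member 1; may also be the group's insert position
    }
  }
  // Before this patch, VPInterleaveRecipe took the insert position's address
  // and subtracted the member index at execution time; with it, the recipe
  // receives the member-0 address (&dst[2 * i]) directly.
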
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00eec0a6f7b14..5951873a960af 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -220,9 +220,15 @@ class VPBuilder {
         new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
   }
 
-  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL,
+  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
                               const Twine &Name = "") {
-    return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+  }
+  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+                                const Twine &Name = "") {
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
   }
 
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
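A minimal usage sketch of the two builder helpers added above, assuming a VPBuilder positioned at the desired insertion point and existing Ptr/Offset VPValues (the variable names here are illustrative, not from the patch):

  VPBuilder Builder(InsertPos);
  // Plain ptradd: no inbounds guarantee attached.
  VPInstruction *Base = Builder.createPtrAdd(Ptr, Offset);
  // Inbounds variant: asserts the result stays within the underlying object.
  VPValue *Member0Addr = Builder.createInBoundsPtrAdd(Ptr, Offset);
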
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index db4631e19c11d..30fadc2c93941 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9005,8 +9005,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
-  VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
-                                          CM.isScalarEpilogueAllowed());
+  VPlanTransforms::createInterleaveGroups(
+      *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
 
   for (ElementCount VF : Range)
     Plan->addVF(VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbcfaf9e19cd0..f841355d32093 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -956,7 +956,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
     DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
   };
 
-protected:
   struct GEPFlagsTy {
     char IsInBounds : 1;
     GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
@@ -1307,6 +1306,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
     assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
   }
 
+  VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags = {false},
+                DebugLoc DL = {}, const Twine &Name = "")
+      : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
+                            ArrayRef<VPValue *>({Ptr, Offset}),
+                            GEPFlagsTy(Flags), DL),
+        Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
+
   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                 FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index dacba152611c1..1a039e0a73614 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -644,7 +644,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
            "can only generate first lane for PtrAdd");
     Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
     Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
-    return Builder.CreatePtrAdd(Ptr, Addend, Name);
+    return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
+                        : Builder.CreatePtrAdd(Ptr, Addend, Name);
   }
   case VPInstruction::ResumePhi: {
     Value *IncomingFromVPlanPred =
@@ -2470,15 +2471,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   unsigned InterleaveFactor = Group->getFactor();
   auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
 
-  // Prepare for the new pointers.
-  unsigned Index = Group->getIndex(Instr);
-
   // TODO: extend the masked interleaved-group support to reversed access.
   VPValue *BlockInMask = getMask();
   assert((!BlockInMask || !Group->isReverse()) &&
          "Reversed masked interleave-group not supported.");
 
-  Value *Idx;
+  Value *Index;
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
@@ -2486,35 +2484,24 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (Group->isReverse()) {
     Value *RuntimeVF =
         getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
-    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
-    Idx = State.Builder.CreateMul(Idx,
-                                  State.Builder.getInt32(Group->getFactor()));
-    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
-    Idx = State.Builder.CreateNeg(Idx);
-  } else
-    Idx = State.Builder.getInt32(-Index);
+    Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+    Index = State.Builder.CreateMul(Index,
+                                    State.Builder.getInt32(Group->getFactor()));
+    Index = State.Builder.CreateNeg(Index);
+  } else {
+    // TODO: Drop redundant 0-index GEP as follow-up.
+    Index = State.Builder.getInt32(0);
+  }
 
   VPValue *Addr = getAddr();
   Value *ResAddr = State.get(Addr, VPLane(0));
   if (auto *I = dyn_cast<Instruction>(ResAddr))
     State.setDebugLocFrom(I->getDebugLoc());
 
-  // Notice current instruction could be any index. Need to adjust the address
-  // to the member of index 0.
-  //
-  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-  //       b = A[i];       // Member of index 0
-  // Current pointer is pointed to A[i+1], adjust it to A[i].
-  //
-  // E.g.  A[i+1] = a;     // Member of index 1
-  //       A[i]   = b;     // Member of index 0
-  //       A[i+2] = c;     // Member of index 2 (Current instruction)
-  // Current pointer is pointed to A[i+2], adjust it to A[i].
-
   bool InBounds = false;
   if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
     InBounds = gep->isInBounds();
-  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
+  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
 
   State.setDebugLocFrom(Instr->getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);
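With the base pointer now rebased onto member 0, the reverse-group index no longer needs the member-index addend the removed lines carried. A worked example with assumed values:

  // VF = 4, interleave factor = 2: the wide access must start at the lowest
  // address it covers, i.e. member 0 of the last vector lane.
  unsigned RuntimeVF = 4, Factor = 2;
  int Index = -int((RuntimeVF - 1) * Factor); // -6 elements
  // GEP(ScalarTy, Member0Addr, Index) yields the wide pointer; previously an
  // extra "+ memberIndex" term rebased it from the insert position instead.
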
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a878613c4ba48..3b7f066f0636f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1590,14 +1590,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
 }
 
 void VPlanTransforms::createInterleaveGroups(
-    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+    VPlan &Plan,
+    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+        &InterleaveGroups,
     VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
+  if (InterleaveGroups.empty())
+    return;
+
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
+  VPDominatorTree VPDT;
+  VPDT.recalculate(Plan);
   for (const auto *IG : InterleaveGroups) {
-    auto *Recipe =
-        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
     SmallVector<VPValue *, 4> StoredValues;
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1607,9 +1612,38 @@ void VPlanTransforms::createInterleaveGroups(
 
     bool NeedsMaskForGaps =
         IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
-    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
-                                        Recipe->getMask(), NeedsMaskForGaps);
-    VPIG->insertBefore(Recipe);
+
+    Instruction *IRInsertPos = IG->getInsertPos();
+    auto *InsertPos =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+
+    // Get or create the start address for the interleave group.
+    auto *Start =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
+    VPValue *Addr = Start->getAddr();
+    if (!VPDT.properlyDominates(Addr->getDefiningRecipe(), InsertPos)) {
+      bool InBounds = false;
+      if (auto *Gep = dyn_cast<GetElementPtrInst>(
+              getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+        InBounds = Gep->isInBounds();
+
+      // We cannot re-use the address of the first member because it does not
+      // dominate the insert position. Use the address of the insert position
+      // and create a PtrAdd to adjust the index to start at the first member.
+      APInt Offset(32,
+                   getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+                       IG->getIndex(IRInsertPos),
+                   /*IsSigned=*/true);
+      VPValue *OffsetVPV = Plan.getOrAddLiveIn(
+          ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+      VPBuilder B(InsertPos);
+      Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
+                      : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
+    }
+    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
+                                        InsertPos->getMask(), NeedsMaskForGaps);
+    VPIG->insertBefore(InsertPos);
+
     unsigned J = 0;
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (Instruction *Member = IG->getMember(i)) {
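Worked example of the offset computed above, with assumed values (an i32 group whose insert position is member 2):

  unsigned ScalarSizeInBytes = 32 / 8; // getLoadStoreType(...) scalar size
  unsigned MemberIndex = 2;            // IG->getIndex(IRInsertPos)
  APInt Offset(32, ScalarSizeInBytes * MemberIndex, /*IsSigned=*/true); // 8
  // The live-in constant is -Offset, so the emitted recipe computes
  // ptradd(InsertPosAddr, -8), marked inbounds when the insert position's
  // GEP was inbounds; the rebase onto member 0 now happens at plan
  // construction time instead of in VPInterleaveRecipe::execute.
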
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f4a17aec42b24..3b792ee32dce6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -114,7 +114,9 @@ struct VPlanTransforms {
   // widening its memory instructions with a single VPInterleaveRecipe at its
   // insertion point.
   static void createInterleaveGroups(
-      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+      VPlan &Plan,
+      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+          &InterleaveGroups,
       VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
 
   /// Remove dead recipes from \p Plan.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
index 87674f611251c..997ef7466d5cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-store-of-first-order-recurrence.ll
@@ -14,8 +14,7 @@ define void @interleaved_store_first_order_recurrence(ptr noalias %src, ptr %dst
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0959d913fd0cd..ba4145217c3ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -41,13 +41,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,6 +119,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -128,13 +127,11 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -404,10 +401,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
-; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
@@ -715,16 +712,14 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1271,14 +1266,11 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
+; CHECK-NEXT:    [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 73e94f2e7ee09..baec7daa463d1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -55,15 +55,13 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -135,15 +133,13 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP7]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 42d3148ac9631..f70e9d6c9416a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -29,12 +29,9 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -77,13 +74,10 @@ define void @load_store_factor2_i32(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
 ; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
 ; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -135,12 +129,9 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
 ; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
-; SCALABLE-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -218,12 +209,9 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
-; CHECK-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; CHECK-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -266,13 +254,10 @@ define void @load_store_factor2_i64(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
 ; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
 ; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -324,12 +309,9 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
-; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
 ; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
-; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -398,17 +380,13 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -456,17 +434,13 @@ define void @load_store_factor3_i32(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 ; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; FIXED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
 ; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
 ; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -514,17 +488,13 @@ define void @load_store_factor3_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 4
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -603,17 +573,13 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -661,17 +627,13 @@ define void @load_store_factor3_i64(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; FIXED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
 ; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
 ; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -719,17 +681,13 @@ define void @load_store_factor3_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], <i64 3, i64 3, i64 3, i64 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 -2
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -813,22 +771,13 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
 ; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP9]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP11]], 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP13]], 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -837,7 +786,7 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -915,22 +864,13 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
 ; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; FIXED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
 ; FIXED-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; FIXED-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 1
 ; FIXED-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; FIXED-NEXT:    [[TMP11:%.*]] = add i64 [[TMP9]], 1
 ; FIXED-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; FIXED-NEXT:    [[TMP13:%.*]] = add i64 [[TMP11]], 1
 ; FIXED-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; FIXED-NEXT:    [[TMP15:%.*]] = add i64 [[TMP13]], 1
 ; FIXED-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; FIXED-NEXT:    [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; FIXED-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
 ; FIXED-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; FIXED-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
 ; FIXED-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; FIXED-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; FIXED-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -939,7 +879,7 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; FIXED-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; FIXED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1017,22 +957,13 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
 ; SCALABLE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], <i64 1, i64 1>
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], <i64 2, i64 2>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], <i64 3, i64 3>
-; SCALABLE-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 1
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], <i64 4, i64 4>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add i64 [[TMP9]], 1
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], <i64 5, i64 5>
-; SCALABLE-NEXT:    [[TMP13:%.*]] = add i64 [[TMP11]], 1
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], <i64 6, i64 6>
-; SCALABLE-NEXT:    [[TMP15:%.*]] = add i64 [[TMP13]], 1
 ; SCALABLE-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], <i64 7, i64 7>
-; SCALABLE-NEXT:    [[TMP17:%.*]] = add i64 [[TMP15]], 1
-; SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP17]]
 ; SCALABLE-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], <i64 8, i64 8>
-; SCALABLE-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP18]], i32 -7
 ; SCALABLE-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SCALABLE-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SCALABLE-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1041,7 +972,7 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SCALABLE-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; SCALABLE-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP20]], align 8
+; SCALABLE-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index cc1d11754b27e..9383799b181c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -89,12 +89,11 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP41:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP41]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -102,8 +101,8 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
 ; CHECK-NEXT:    [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP41]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -266,8 +265,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
 ; CHECK-NEXT:    [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer
 ; CHECK-NEXT:    [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
 ; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -490,11 +488,10 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP50]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !alias.scope [[META6:![0-9]+]]
-; CHECK-NEXT:    [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7
-; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
index 6e83cf612f82b..0bbf76edde262 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
@@ -67,9 +67,8 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 {
 ; CHECK-NEXT:    store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 -2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
 ; CHECK-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:    [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false>
 ; CHECK-NEXT:    call void @llvm.masked.store.v48i8.p0(<48 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison>, ptr [[TMP14]], i32 1, <48 x i1> [[TMP15]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 6b52023cfbcae..968058134690b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1419,13 +1419,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC1]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 -1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -2555,13 +2553,11 @@ define dso_local void @masked_strided2_unknown_tc(ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = or disjoint i32 [[TMP2]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP2]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP5]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 -1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2989,13 +2985,11 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(ptr noalias noca
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
index fabe2eb8062bb..b850dc3ecef85 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -72,17 +72,15 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: @test1(
 ; ENABLED_MASKED_STRIDED-NEXT:  entry:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDEX]], 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[POINTS:%.*]], i64 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP3]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison>
 ; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], i32 2, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -239,7 +237,6 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -249,11 +246,10 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP1]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[POINTS:%.*]], i64 [[TMP2]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP3]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison)
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP2]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP4]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 5028718dc49d1..8773350bdb424 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -18,7 +18,8 @@ define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n
 ; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 -1
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP41]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC2]], [[STRIDED_VEC]]
@@ -92,8 +93,9 @@ define void @test_ig_insert_pos_at_end_of_vpbb(ptr noalias %dst, ptr noalias %sr
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr { i16, i16, i16, i16 }, ptr [[SRC]], i64 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 -2
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 -4
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP51]], align 2
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i16> [[STRIDED_VEC]], i32 3
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 12d0078617d93..cba35dcb78ca4 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -38,13 +38,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) {
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -115,9 +113,9 @@ define void @test_struct_array_load3_store3() {
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -270,10 +268,10 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -357,8 +355,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -28
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -24
 ; CHECK-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -669,15 +667,13 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1286,19 +1282,16 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 -4
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 26974c2307065..b36e7cf092c85 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -349,10 +349,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT:     ir<%AB.0> = load from index 0
 ; CHECK-NEXT:     ir<%AB.1> = load from index 1
 ; CHECK-NEXT:     ir<%AB.3> = load from index 3
-; CHECK-NEXT:   CLONE ir<%iv.plus.3> = add vp<[[STEPS]]>, ir<3>
 ; CHECK-NEXT:   WIDEN ir<%add> = add nsw ir<%AB.0>, ir<%AB.1>
-; CHECK-NEXT:   CLONE ir<%gep.CD.3> = getelementptr inbounds ir<@CD>, ir<0>, ir<%iv.plus.3>
-; CHECK-NEXT:   INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.3>
+; CHECK-NEXT:   CLONE ir<%gep.CD.0> = getelementptr inbounds ir<@CD>, ir<0>, vp<[[STEPS]]>
+; CHECK-NEXT:   INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.0>
 ; CHECK-NEXT:     store ir<%add> to index 0
 ; CHECK-NEXT:     store ir<1> to index 1
 ; CHECK-NEXT:     store ir<2> to index 2

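Taken together, the check-line updates above all reflect one codegen change: the interleave group's address is now computed directly from member 0, so the scalar index bumps (the removed "add"/"or disjoint ..., 1" instructions) and the compensating negative-offset GEP both disappear. A simplified C-style sketch of the address arithmetic, illustrative only and not taken from the patch itself (here i is the scalar index of member 0 and k the position of the group's last member):

    /* before: reach the last member of the group, then step back
       to member 0 with a negative offset */
    long *addr_before(long *p, long i, long k) {
      return (p + i + k) - k;
    }

    /* after: member 0 is addressed directly */
    long *addr_after(long *p, long i) {
      return p + i;
    }

Both forms address the same location; dropping the detour through the last member is what removes the extra instructions in every test above.
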
>From 725a1e7cab8163a9968337ebdd3e0f1269443588 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 15 Aug 2024 13:38:58 +0100
Subject: [PATCH 02/19] [LV] Add

---
 ...sform-narrow-interleave-to-widen-memory.ll |  779 ++++++++
 ...sform-narrow-interleave-to-widen-memory.ll | 1677 +++++++++++++++++
 2 files changed, 2456 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll

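The new tests model the pattern from the issue referenced below: every member of a small interleave group in %data is scaled by the same per-iteration factor. As a hypothetical C equivalent of the factor-2 case (test_2xi64; the 4x variants use a four-field struct instead):

    /* Hypothetical C equivalent of the test_2xi64 loop below; on
       x86_64-unknown-linux, long is 64 bits.  Both fields are scaled
       by the same factor[i], so the factor-2 interleave group over
       data is a candidate for narrowing to plain wide loads/stores. */
    void test_2xi64(long *data, long *factor, long n) {
      for (long i = 0; i < n; i++) {
        data[2 * i]     *= factor[i];
        data[2 * i + 1] *= factor[i];
      }
    }
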
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..d6cd44d0c4f0c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,779 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s
+; https://github.com/llvm/llvm-project/issues/82936
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_4xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT:    store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
+; CHECK-NEXT:    [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT:    store i64 [[MUL_3]], ptr [[DATA_3]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i64, ptr %data.2, align 8
+  %mul.2 = mul i64 %l.factor, %l.2
+  store i64 %mul.2, ptr %data.2, align 8
+  %data.3 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 3
+  %l.3 = load i64, ptr %data.3, align 8
+  %mul.3 = mul i64 %l.factor, %l.3
+  store i64 %mul.3, ptr %data.3, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.0 = mul i64 %l.factor, %l.1
+  store i64 %mul.0, ptr %data.0, align 8
+  %mul.1 = mul i64 %l.factor, %l.0
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_1(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.0, align 8
+  store i64 %mul.0, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
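+; Like @test_2xi64_store_order_flipped_1, but the two store instructions are
+; also emitted in flipped order: data.1 is stored first, then data.0.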
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_2(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.0, ptr %data.1, align 8
+  store i64 %mul.1, ptr %data.0, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
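+; The mul operands come from different base pointers: mul.0 multiplies two
+; loads of %gep.src.0, while mul.1 uses loads of %src.1 and %data.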
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+  %l.src.0 = load i64, ptr %gep.src.0, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %gep.src.0, align 8
+  %mul.0 = mul i64 %l.src.0, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+  %l.src.1 = load i64, ptr %gep.src.1, align 8
+  %mul.1 = mul i64 %l.src.1, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
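+; Factor-3 interleave group over a { i64, i64, i64 } struct: each member is
+; loaded, multiplied by l.factor, and stored back.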
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT:    store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i64, ptr %data.2, align 8
+  %mul.2 = mul i64 %l.factor, %l.2
+  store i64 %mul.2, ptr %data.2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
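+; Only fields 0 and 1 of the { i32, i32, i32 } struct are read and written;
+; per the checks below, the loop is left scalar.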
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i32, ptr %data.0, align 8
+  %mul.0 = mul i32 %l.factor, %l.0
+  store i32 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i32, ptr %data.1, align 8
+  %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
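+; All three i32 fields are read and written; vectorized at VF 8 using
+; factor-3 interleave groups.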
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 8, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT:    store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i32, ptr %data.0, align 8
+  %mul.0 = mul i32 %l.factor, %l.0
+  store i32 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i32, ptr %data.1, align 8
+  %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i32, ptr %data.2, align 8
+  %mul.2 = mul i32 %l.factor, %l.2
+  store i32 %mul.2, ptr %data.2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..accd340a0d8ea
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,1677 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+; https://github.com/llvm/llvm-project/issues/82936
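+;
+; Tests for narrowing interleave groups of loads and stores into plain wide
+; loads and stores, checked at VF 2 and VF 4 (see the RUN lines above).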
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP15]], i32 0
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP15]], i32 1
+; VF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF2-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0
+; VF2-NEXT:    [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2-NEXT:    [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT:    store i64 [[TMP25]], ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP19]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT:    [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT:    store i64 [[TMP21]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT:    store i64 [[TMP22]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP30]]
+; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
+; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP32]]
+; VF4-NEXT:    [[TMP34:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP35:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP31]], align 8
+; VF4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP33]], align 8
+; VF4-NEXT:    [[TMP38:%.*]] = insertelement <4 x i64> poison, i64 [[TMP34]], i32 0
+; VF4-NEXT:    [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1
+; VF4-NEXT:    [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2
+; VF4-NEXT:    [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3
+; VF4-NEXT:    [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP41]]
+; VF4-NEXT:    [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT:    store i64 [[TMP43]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT:    store i64 [[TMP44]], ptr [[TMP29]], align 8
+; VF4-NEXT:    [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT:    store i64 [[TMP45]], ptr [[TMP31]], align 8
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
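+; Both group members are loaded before the muls, and each stored value uses
+; the other member's load: l.1 feeds the store to data.0, l.0 the store to
+; data.1.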
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT:    [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]]
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]]
+; VF2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP17]], i32 0
+; VF2-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP18]], i32 1
+; VF2-NEXT:    [[TMP21:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP20]]
+; VF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0
+; VF2-NEXT:    store i64 [[TMP22]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i64> [[TMP21]], i32 1
+; VF2-NEXT:    store i64 [[TMP23]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT:    store i64 [[TMP25]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT:    [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP21]]
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP23]]
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP25]]
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF4-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF4-NEXT:    [[TMP30:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP33:%.*]] = insertelement <4 x i64> poison, i64 [[TMP29]], i32 0
+; VF4-NEXT:    [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 1
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 2
+; VF4-NEXT:    [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 3
+; VF4-NEXT:    [[TMP37:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP36]]
+; VF4-NEXT:    [[TMP38:%.*]] = extractelement <4 x i64> [[TMP37]], i32 0
+; VF4-NEXT:    store i64 [[TMP38]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[TMP37]], i32 1
+; VF4-NEXT:    store i64 [[TMP39]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = extractelement <4 x i64> [[TMP37]], i32 2
+; VF4-NEXT:    store i64 [[TMP40]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP41:%.*]] = extractelement <4 x i64> [[TMP37]], i32 3
+; VF4-NEXT:    store i64 [[TMP41]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT:    [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT:    store i64 [[TMP43]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT:    store i64 [[TMP44]], ptr [[TMP24]], align 8
+; VF4-NEXT:    [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT:    store i64 [[TMP45]], ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.0 = mul i64 %l.factor, %l.1
+  store i64 %mul.0, ptr %data.0, align 8
+  %mul.1 = mul i64 %l.factor, %l.0
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
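+; The stored values are swapped: mul.1 goes to data.0 and mul.0 to data.1.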
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT:    [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT:    store i64 [[TMP23]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT:    store i64 [[TMP24]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    store i64 [[TMP25]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP17]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT:    [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT:    [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT:    [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT:    [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT:    [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT:    [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT:    store i64 [[TMP39]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT:    store i64 [[TMP40]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP41:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT:    store i64 [[TMP41]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP42:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT:    store i64 [[TMP42]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP43:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT:    store i64 [[TMP43]], ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP44:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT:    store i64 [[TMP44]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP45:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT:    store i64 [[TMP45]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP29]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.0, align 8
+  store i64 %mul.0, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
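+; Second flipped-store variant: the value-to-member mapping is the same as in
+; @test_2xi64_store_order_flipped_1, but the two stores themselves are emitted
+; in the opposite order (data.1 is written before data.0). The data accesses
+; again stay scalarized in the checks.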
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT:    [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    store i64 [[TMP23]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    store i64 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT:    store i64 [[TMP25]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT:    [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT:    [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT:    [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT:    [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT:    [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT:    [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT:    store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT:    store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP41:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT:    store i64 [[TMP41]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP42:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT:    store i64 [[TMP42]], ptr [[TMP29]], align 8
+; VF4-NEXT:    [[TMP43:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT:    store i64 [[TMP43]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP44:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT:    store i64 [[TMP44]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP45:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT:    store i64 [[TMP45]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.0, ptr %data.1, align 8
+  store i64 %mul.1, ptr %data.0, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
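+; Here the two multiplies are fed by loads from different base pointers
+; (src.0 and src.1) rather than a single shared factor load, and data.0 is
+; only stored, never reloaded (l.0 rereads gep.src.0), so presumably only the
+; stores could form a group. Note the function name says fmul although the
+; operations are integer mul.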
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF2-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0
+; VF2-NEXT:    store i64 [[TMP9]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
+; VF2-NEXT:    store i64 [[TMP10]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
+; VF2-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[TMP17]], i32 1
+; VF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+; VF2-NEXT:    [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD2]], [[TMP19]]
+; VF2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT:    store i64 [[TMP23]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT:    store i64 [[TMP24]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF2-NEXT:    [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[TMP27:%.*]] = or disjoint i64 [[TMP26]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF2-NEXT:    [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0
+; VF4-NEXT:    store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1
+; VF4-NEXT:    store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; VF4-NEXT:    store i64 [[TMP15]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; VF4-NEXT:    store i64 [[TMP16]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF4-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP21]], align 8
+; VF4-NEXT:    [[TMP28:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP30:%.*]] = insertelement <4 x i64> poison, i64 [[TMP26]], i32 0
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP27]], i32 1
+; VF4-NEXT:    [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 2
+; VF4-NEXT:    [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 3
+; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP35]], align 8
+; VF4-NEXT:    [[TMP36:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[TMP33]]
+; VF4-NEXT:    [[TMP37:%.*]] = extractelement <4 x i64> [[TMP36]], i32 0
+; VF4-NEXT:    store i64 [[TMP37]], ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP38:%.*]] = extractelement <4 x i64> [[TMP36]], i32 1
+; VF4-NEXT:    store i64 [[TMP38]], ptr [[TMP21]], align 8
+; VF4-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[TMP36]], i32 2
+; VF4-NEXT:    store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = extractelement <4 x i64> [[TMP36]], i32 3
+; VF4-NEXT:    store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF4-NEXT:    [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT:    [[TMP42:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP42]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[TMP43:%.*]] = or disjoint i64 [[TMP42]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP43]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF4-NEXT:    [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+  %l.src.0 = load i64, ptr %gep.src.0, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %gep.src.0, align 8
+  %mul.0 = mul i64 %l.src.0, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+  %l.src.1 = load i64, ptr %gep.src.1, align 8
+  %mul.1 = mul i64 %l.src.1, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
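+; Interleave-factor-3 variant laid out as a { i64, i64, i64 } struct: each
+; member is loaded, multiplied by the shared l.factor, and stored back in
+; place. The checks show each member handled as a separate scalarized
+; gather/scatter at both VFs.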
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
+; VF2-NEXT:    [[TMP10:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP9]]
+; VF2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
+; VF2-NEXT:    store i64 [[TMP11]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
+; VF2-NEXT:    store i64 [[TMP12]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0
+; VF2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[TMP16]], i32 1
+; VF2-NEXT:    [[TMP19:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP18]]
+; VF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i32 0
+; VF2-NEXT:    store i64 [[TMP20]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i32 1
+; VF2-NEXT:    store i64 [[TMP21]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP24]], i32 0
+; VF2-NEXT:    [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1
+; VF2-NEXT:    [[TMP28:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP27]]
+; VF2-NEXT:    [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i32 0
+; VF2-NEXT:    store i64 [[TMP29]], ptr [[TMP22]], align 8
+; VF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i32 1
+; VF2-NEXT:    store i64 [[TMP30]], ptr [[TMP23]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF2-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT:    store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_3xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3
+; VF4-NEXT:    [[TMP18:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP17]]
+; VF4-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; VF4-NEXT:    store i64 [[TMP19]], ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; VF4-NEXT:    store i64 [[TMP20]], ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; VF4-NEXT:    store i64 [[TMP21]], ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; VF4-NEXT:    store i64 [[TMP22]], ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP28:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP30:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP27]], i32 0
+; VF4-NEXT:    [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 1
+; VF4-NEXT:    [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 2
+; VF4-NEXT:    [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 3
+; VF4-NEXT:    [[TMP35:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP34]]
+; VF4-NEXT:    [[TMP36:%.*]] = extractelement <4 x i64> [[TMP35]], i32 0
+; VF4-NEXT:    store i64 [[TMP36]], ptr [[TMP23]], align 8
+; VF4-NEXT:    [[TMP37:%.*]] = extractelement <4 x i64> [[TMP35]], i32 1
+; VF4-NEXT:    store i64 [[TMP37]], ptr [[TMP24]], align 8
+; VF4-NEXT:    [[TMP38:%.*]] = extractelement <4 x i64> [[TMP35]], i32 2
+; VF4-NEXT:    store i64 [[TMP38]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[TMP35]], i32 3
+; VF4-NEXT:    store i64 [[TMP39]], ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP41:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT:    [[TMP42:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP43:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP44:%.*]] = load i64, ptr [[TMP40]], align 8
+; VF4-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP41]], align 8
+; VF4-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP42]], align 8
+; VF4-NEXT:    [[TMP47:%.*]] = load i64, ptr [[TMP43]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = insertelement <4 x i64> poison, i64 [[TMP44]], i32 0
+; VF4-NEXT:    [[TMP49:%.*]] = insertelement <4 x i64> [[TMP48]], i64 [[TMP45]], i32 1
+; VF4-NEXT:    [[TMP50:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP46]], i32 2
+; VF4-NEXT:    [[TMP51:%.*]] = insertelement <4 x i64> [[TMP50]], i64 [[TMP47]], i32 3
+; VF4-NEXT:    [[TMP52:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP51]]
+; VF4-NEXT:    [[TMP53:%.*]] = extractelement <4 x i64> [[TMP52]], i32 0
+; VF4-NEXT:    store i64 [[TMP53]], ptr [[TMP40]], align 8
+; VF4-NEXT:    [[TMP54:%.*]] = extractelement <4 x i64> [[TMP52]], i32 1
+; VF4-NEXT:    store i64 [[TMP54]], ptr [[TMP41]], align 8
+; VF4-NEXT:    [[TMP55:%.*]] = extractelement <4 x i64> [[TMP52]], i32 2
+; VF4-NEXT:    store i64 [[TMP55]], ptr [[TMP42]], align 8
+; VF4-NEXT:    [[TMP56:%.*]] = extractelement <4 x i64> [[TMP52]], i32 3
+; VF4-NEXT:    store i64 [[TMP56]], ptr [[TMP43]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP57:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP57]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF4-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT:    store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i64, ptr %data.2, align 8
+  %mul.2 = mul i64 %l.factor, %l.2
+  store i64 %mul.2, ptr %data.2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
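+; i32 variant of the struct-based pattern: the struct has three i32 members
+; but only the first two are updated, and the factor element is loaded as an
+; i32 through an i64-typed GEP, matching the scalar source below.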
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT:    store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT:    [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT:    store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT:    store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT:    [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT:    [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT:    [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT:    store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP50]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i32, ptr %data.0, align 8
+  %mul.0 = mul i32 %l.factor, %l.0
+  store i32 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i32, ptr %data.1, align 8
+  %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT:    store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT:    [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT:    store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP26]], align 8
+; VF2-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 8
+; VF2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0
+; VF2-NEXT:    [[TMP31:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF2-NEXT:    [[TMP32:%.*]] = mul <2 x i32> [[TMP7]], [[TMP31]]
+; VF2-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; VF2-NEXT:    store i32 [[TMP33]], ptr [[TMP26]], align 8
+; VF2-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1
+; VF2-NEXT:    store i32 [[TMP34]], ptr [[TMP27]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT:    [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF2-NEXT:    [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT:    store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_3xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT:    store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT:    [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT:    [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT:    [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT:    [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT:    store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT:    [[TMP50:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP51:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP53:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT:    [[TMP54:%.*]] = load i32, ptr [[TMP50]], align 8
+; VF4-NEXT:    [[TMP55:%.*]] = load i32, ptr [[TMP51]], align 8
+; VF4-NEXT:    [[TMP56:%.*]] = load i32, ptr [[TMP52]], align 8
+; VF4-NEXT:    [[TMP57:%.*]] = load i32, ptr [[TMP53]], align 8
+; VF4-NEXT:    [[TMP58:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i32 0
+; VF4-NEXT:    [[TMP59:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP55]], i32 1
+; VF4-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP56]], i32 2
+; VF4-NEXT:    [[TMP61:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP57]], i32 3
+; VF4-NEXT:    [[TMP62:%.*]] = mul <4 x i32> [[TMP15]], [[TMP61]]
+; VF4-NEXT:    [[TMP63:%.*]] = extractelement <4 x i32> [[TMP62]], i32 0
+; VF4-NEXT:    store i32 [[TMP63]], ptr [[TMP50]], align 8
+; VF4-NEXT:    [[TMP64:%.*]] = extractelement <4 x i32> [[TMP62]], i32 1
+; VF4-NEXT:    store i32 [[TMP64]], ptr [[TMP51]], align 8
+; VF4-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[TMP62]], i32 2
+; VF4-NEXT:    store i32 [[TMP65]], ptr [[TMP52]], align 8
+; VF4-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[TMP62]], i32 3
+; VF4-NEXT:    store i32 [[TMP66]], ptr [[TMP53]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT:    [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT:    [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT:    store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT:    [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT:    store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT:    [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF4-NEXT:    [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT:    store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+  %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+  %l.0 = load i32, ptr %data.0, align 8
+  %mul.0 = mul i32 %l.factor, %l.0
+  store i32 %mul.0, ptr %data.0, align 8
+  %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+  %l.1 = load i32, ptr %data.1, align 8
+  %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+  %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+  %l.2 = load i32, ptr %data.2, align 8
+  %mul.2 = mul i32 %l.factor, %l.2
+  store i32 %mul.2, ptr %data.2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF2: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF2: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF2: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF2: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF2: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF2: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF2: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF2: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
+; VF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.

>From 8252b0cc74c6c1ae7cdc28af51d7a5f82c5bbb7a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Jul 2024 11:24:55 +0100
Subject: [PATCH 03/19] [VPlan] Add transformation to narrow interleave groups.

This patch adds a new narrowInterleaveGroups transform, which tries to
convert a plan containing interleave groups with VF elements into a
plan that instead replaces the interleave groups with wide loads and
stores processing VF elements.

This is effectively a very simple form of loop-aware SLP, where
interleave groups are used to identify candidates.

This initial version is quite restricted and hopefully serves as a
starting point for how best to model these kinds of transforms.

Depends on https://github.com/llvm/llvm-project/pull/106431.

Fixes https://github.com/llvm/llvm-project/issues/82936
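
To illustrate the kind of loop this targets, here is a minimal C++
sketch modeled on the test_4xi64 test below (struct and function names
are illustrative only, not part of the patch):

  struct Data { long a, b, c, d; };

  void mul_by_factor(Data *data, const long *factor, long n) {
    for (long i = 0; i < n; ++i) {
      long f = factor[i]; // shared factor feeds every member
      data[i].a *= f;     // the four member loads form a factor-4
      data[i].b *= f;     // interleave group, and the four stores
      data[i].c *= f;     // form a matching store group
      data[i].d *= f;
    }
  }

With VF=4, the existing codegen loads four structs (16 i64s) per vector
iteration and de-interleaves them with shuffles; after narrowing, each
vector iteration processes a single struct with one contiguous
<4 x i64> load, a broadcast of the factor, one multiply and one
<4 x i64> store.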
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   3 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   4 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 128 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   2 +
 ...sform-narrow-interleave-to-widen-memory.ll |  74 ++++------
 5 files changed, 162 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 30fadc2c93941..cb55b6636fc27 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7507,6 +7507,9 @@ LoopVectorizationPlanner::executePlan(
                               OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
+  if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF)) {
+    LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
+  }
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                     << ", UF=" << BestUF << '\n');
   BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a039e0a73614..96b1616473818 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -48,6 +48,9 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
+  case VPInstructionSC: {
+    return !Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode());
+  }
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -63,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
+  case VPVectorPointerSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b7f066f0636f..285c5be7a105d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,6 +668,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE) {
+
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
   VPBasicBlock *ExitingVPBB =
@@ -710,6 +711,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // TODO: Further simplifications are possible
   //      1. Replace inductions with constants.
   //      2. Replace vector loop region with VPBasicBlock.
+  //
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1657,3 +1659,129 @@ void VPlanTransforms::createInterleaveGroups(
       }
   }
 }
+
+static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+  if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
+    if (W->getMask())
+      return false;
+    return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
+  }
+
+  if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe())) {
+    return IR->getInterleaveGroup()->getFactor() ==
+               IR->getInterleaveGroup()->getNumMembers() &&
+           IR->getVPValue(Idx) == V;
+  }
+  return false;
+}
+
+/// Returns true if \p IR is a consecutive interleave group with \p VF members.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
+                                         ElementCount VF) {
+  if (!IR)
+    return false;
+  auto IG = IR->getInterleaveGroup();
+  return IG->getFactor() == IG->getNumMembers() &&
+         IG->getNumMembers() == VF.getKnownMinValue();
+}
+
+bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+  using namespace llvm::VPlanPatternMatch;
+  if (VF.isScalable())
+    return false;
+
+  bool Changed = false;
+  SmallVector<VPInterleaveRecipe *> StoreGroups;
+  for (auto &R : make_early_inc_range(
+           *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+    if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+        isa<VPCanonicalIVPHIRecipe>(&R))
+      continue;
+
+    // Bail out on recipes not supported at the moment:
+    //  * phi recipes other than the canonical induction
+    //  * recipes writing to memory except interleave groups
+    // Only support plans with a canonical induction phi.
+    if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
+        (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
+      return false;
+
+    auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (!IR)
+      continue;
+
+    if (!isConsecutiveInterleaveGroup(IR, VF))
+      return false;
+    if (IR->getStoredValues().empty())
+      continue;
+
+    auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
+        IR->getStoredValues()[0]->getDefiningRecipe());
+    if (!Lane0)
+      return false;
+    for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
+      auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+      if (!R || R->getOpcode() != Lane0->getOpcode())
+        return false;
+      // Work around captured structured bindings being a C++20 extension.
+      auto Idx = I;
+      if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+            return !supportedLoad(Lane0, V, Idx);
+          }))
+        return false;
+    }
+
+    StoreGroups.push_back(IR);
+  }
+
+  // Narrow operation tree rooted at store groups.
+  for (auto *StoreGroup : StoreGroups) {
+    auto *Lane0 = cast<VPWidenRecipe>(
+        StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+    unsigned LoadGroupIdx =
+        isa<VPInterleaveRecipe>(Lane0->getOperand(1)->getDefiningRecipe()) ? 1
+                                                                           : 0;
+    unsigned WideLoadIdx = 1 - LoadGroupIdx;
+    auto *LoadGroup = cast<VPInterleaveRecipe>(
+        Lane0->getOperand(LoadGroupIdx)->getDefiningRecipe());
+
+    auto *WideLoad = cast<VPWidenLoadRecipe>(
+        Lane0->getOperand(WideLoadIdx)->getDefiningRecipe());
+
+    // Narrow wide load to uniform scalar load, as transformed VPlan will only
+    // process one original iteration.
+    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+                                    WideLoad->operands(), true);
+    // Narrow interleave group to wide load, as transformed VPlan will only
+    // process one original iteration.
+    auto *L = new VPWidenLoadRecipe(
+        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+        LoadGroup->getAddr(), LoadGroup->getMask(), true, false,
+        LoadGroup->getDebugLoc());
+    L->insertBefore(LoadGroup);
+    N->insertBefore(LoadGroup);
+    Lane0->setOperand(LoadGroupIdx, L);
+    Lane0->setOperand(WideLoadIdx, N);
+
+    auto *S = new VPWidenStoreRecipe(
+        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+        StoreGroup->getAddr(), Lane0, nullptr, true, false,
+        StoreGroup->getDebugLoc());
+    S->insertBefore(StoreGroup);
+    StoreGroup->eraseFromParent();
+    Changed = true;
+  }
+
+  if (!Changed)
+    return false;
+
+  // Adjust induction to reflect that the transformed plan only processes one
+  // original iteration.
+  auto *CanIV = Plan.getCanonicalIV();
+  VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  Inc->setOperand(
+      1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+  removeDeadRecipes(Plan);
+  return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6..0bb9d0a7fde85 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,8 @@ struct VPlanTransforms {
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
+
+  static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index d6cd44d0c4f0c..ffe66ff0007fb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,28 +20,16 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -50,23 +38,23 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
-; CHECK-NEXT:    [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
 ; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
 ; CHECK-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT:    [[MUL_0:%.*]] = mul i64 [[L_2]], [[L_0]]
 ; CHECK-NEXT:    store i64 [[MUL_0]], ptr [[DATA_0]], align 8
 ; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
 ; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_2]], [[L_1]]
 ; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[DATA_1]], align 8
-; CHECK-NEXT:    [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
-; CHECK-NEXT:    store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT:    [[DATA_4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT:    [[L_4:%.*]] = load i64, ptr [[DATA_4]], align 8
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i64 [[L_2]], [[L_4]]
+; CHECK-NEXT:    store i64 [[MUL_2]], ptr [[DATA_4]], align 8
 ; CHECK-NEXT:    [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
 ; CHECK-NEXT:    [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i64 [[L_2]], [[L_3]]
 ; CHECK-NEXT:    store i64 [[MUL_3]], ptr [[DATA_3]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -129,13 +117,10 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,14 +197,11 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -297,13 +279,10 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -458,6 +437,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
@@ -469,7 +449,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
@@ -554,14 +534,12 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -693,14 +671,12 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
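
To make the CHECK diff above concrete, the narrowed vector body for
test_4xi64 behaves roughly like the following hand-written sketch using
Clang/GCC vector extensions (an approximation for illustration, not
code produced by the patch; the struct members are viewed as a flat
i64 array):

  typedef long v4i64 __attribute__((vector_size(32)));

  void narrowed_body(long *data, const long *factor, long n_vec) {
    for (long i = 0; i < n_vec; ++i) {  // induction now steps by 1
      long f = factor[i];               // uniform scalar load of the factor
      v4i64 splat = {f, f, f, f};       // broadcast (splatinsert + shuffle)
      v4i64 wide;                       // one contiguous <4 x i64> load
      __builtin_memcpy(&wide, &data[4 * i], sizeof wide);
      v4i64 res = splat * wide;         // single wide multiply
      __builtin_memcpy(&data[4 * i], &res, sizeof res);
    }
  }

The scalar epilogue still handles the remainder iterations, exactly as
in the unnarrowed plan.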

>From 00471e2d90ca8bf6f5bccb3cec632060240b6b3c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 20:53:01 +0100
Subject: [PATCH 04/19] !fixup address latest comments.

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp           | 1 -
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 6ddbfcf0ecfe5..a18ab844c5ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -954,7 +954,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 285c5be7a105d..219ffc61b9eca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,7 +668,6 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE) {
-
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
   VPBasicBlock *ExitingVPBB =

>From 61279d125ce45d95648ca7784b57e388ae7d0661 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 21:32:15 +0100
Subject: [PATCH 05/19] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 41 ++++++++++---------
 .../Transforms/Vectorize/VPlanTransforms.h    |  2 +-
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb55b6636fc27..3eb830bcc0209 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7506,10 +7506,8 @@ LoopVectorizationPlanner::executePlan(
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                               OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF);
 
-  if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF)) {
-    LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
-  }
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                     << ", UF=" << BestUF << '\n');
   BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 219ffc61b9eca..58188379c7fab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -29,6 +29,9 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
 using namespace llvm;
 
 void VPlanTransforms::VPInstructionsToVPRecipes(
@@ -1674,25 +1677,24 @@ static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
   return false;
 }
 
-/// Returns true if \p IR is a consecutive interleave group with \p VF members.
+/// Returns true if \p IR is a full interleave group with factor and number of
+/// members both equal to \p VF.
 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
                                          ElementCount VF) {
   if (!IR)
     return false;
   auto IG = IR->getInterleaveGroup();
   return IG->getFactor() == IG->getNumMembers() &&
-         IG->getNumMembers() == VF.getKnownMinValue();
+         IG->getNumMembers() == VF.getFixedValue();
 }
 
-bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   using namespace llvm::VPlanPatternMatch;
   if (VF.isScalable())
-    return false;
+    return;
 
-  bool Changed = false;
   SmallVector<VPInterleaveRecipe *> StoreGroups;
-  for (auto &R : make_early_inc_range(
-           *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+  for (auto &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
     if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
         isa<VPCanonicalIVPHIRecipe>(&R))
       continue;
@@ -1701,38 +1703,43 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
     //  * phi recipes other than the canonical induction
     //  * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
-    if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
-        (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
-      return false;
+    if (R.isPhi())
+      return;
 
     auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (R.mayWriteToMemory() && !IR)
+      return;
+
     if (!IR)
       continue;
 
     if (!isConsecutiveInterleaveGroup(IR, VF))
-      return false;
+      return;
     if (IR->getStoredValues().empty())
       continue;
 
     auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
         IR->getStoredValues()[0]->getDefiningRecipe());
     if (!Lane0)
-      return false;
+      return;
     for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
       auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != Lane0->getOpcode())
-        return false;
+        return;
       // Work around captured structured bindings being a C++20 extension.
       auto Idx = I;
       if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
             return !supportedLoad(Lane0, V, Idx);
           }))
-        return false;
+        return;
     }
 
     StoreGroups.push_back(IR);
   }
 
+  if (StoreGroups.empty())
+    return;
+
   // Narrow operation tree rooted at store groups.
   for (auto *StoreGroup : StoreGroups) {
     auto *Lane0 = cast<VPWidenRecipe>(
@@ -1769,12 +1776,8 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
         StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
-    Changed = true;
   }
 
-  if (!Changed)
-    return false;
-
   // Adjust induction to reflect that the transformed plan only processes one
   // original iteration.
   auto *CanIV = Plan.getCanonicalIV();
@@ -1782,5 +1785,5 @@ bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   Inc->setOperand(
       1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
-  return true;
+  LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0bb9d0a7fde85..efe4bde8998fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -122,7 +122,7 @@ struct VPlanTransforms {
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
 
-  static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
+  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
 };
 
 } // namespace llvm

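For context on when the new transform fires: isConsecutiveInterleaveGroup above only accepts "full" groups, i.e. groups whose interleave factor and member count both equal the chosen fixed VF, and narrowInterleaveGroups additionally requires every stored value to be defined by widen recipes of a single opcode whose operands are either the matching member of a load group or an unmasked consecutive wide load. A source-level sketch of a loop that satisfies this at VF = 2, modeled on the test_2xi64 functions in the checks below (names are illustrative, not taken from the patch):

    // Both struct members are updated by the same widened opcode (mul)
    // from the same consecutive wide load of factor[i], so the factor-2
    // interleave group over data has exactly VF members at VF = 2 and is
    // a narrowing candidate.
    void scale_pairs(long *data, const long *factor, long n) {
      for (long i = 0; i < n; i++) {
        data[2 * i]     *= factor[i]; // member 0 of the interleave group
        data[2 * i + 1] *= factor[i]; // member 1 of the interleave group
      }
    }
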
>From 3637cfb0b11a625d7303f385524641402f5b2c0c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 26 Sep 2024 19:24:47 +0100
Subject: [PATCH 06/19] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 58188379c7fab..b51a28dafea7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -713,7 +713,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // TODO: Further simplifications are possible
   //      1. Replace inductions with constants.
   //      2. Replace vector loop region with VPBasicBlock.
-  //
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1664,8 +1663,6 @@ void VPlanTransforms::createInterleaveGroups(
 
 static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
   if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
-    if (W->getMask())
-      return false;
     return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
   }
 
@@ -1726,9 +1723,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
       auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != Lane0->getOpcode())
         return;
-      // Work around captured structured bindings being a C++20 extension.
-      auto Idx = I;
-      if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+      if (any_of(R->operands(), [Lane0, Idx=I](VPValue *V) {
             return !supportedLoad(Lane0, V, Idx);
           }))
         return;

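A note on the capture change in this fixup: C++17 does not allow naming a structured binding directly in a lambda capture list (Clang diagnoses it as a C++20 extension), but an init-capture that copies the binding into a fresh name is valid, which is what the deleted workaround comment referred to. A minimal self-contained illustration of the two forms:

    #include <utility>

    int main() {
      std::pair<int, int> P{1, 2};
      auto [A, B] = P;
      // auto Bad = [A] { return A; };    // rejected in C++17: capturing a
      //                                  // structured binding by name is a
      //                                  // C++20 extension
      auto Good = [X = A] { return X; };  // OK: init-capture copies A into X
      return Good() + B;                  // 1 + 2
    }
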
>From f2dcf3d2c9c1c94704355d157b6ec6d20a2dce6e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 27 Sep 2024 09:37:21 +0100
Subject: [PATCH 07/19] !fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b51a28dafea7a..a1144fd9d801c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1723,7 +1723,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
       auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != Lane0->getOpcode())
         return;
-      if (any_of(R->operands(), [Lane0, Idx=I](VPValue *V) {
+      if (any_of(R->operands(), [Lane0, Idx = I](VPValue *V) {
             return !supportedLoad(Lane0, V, Idx);
           }))
         return;

>From 1937f997be3cf00987cc601994417765ce7bc48e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Dec 2024 20:00:07 +0000
Subject: [PATCH 08/19] !fixup fix after merge, address comments, thanks

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  6 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  1 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  6 +-
 ...sform-narrow-interleave-to-widen-memory.ll | 39 +++++-------
 ...sform-narrow-interleave-to-widen-memory.ll | 60 +++++++++----------
 5 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0ca736c817e82..0c093683a84a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -940,6 +940,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
   unsigned UF = getUF();
+
+  if (VFxUF.getNumUsers() == 0) {
+    assert(VF.getNumUsers() == 0 && "expected no users of VF");
+    return;
+  }
+
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
     VF.setUnderlyingValue(RuntimeVF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 36216c3076e90..f29c15f86a28c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -91,7 +91,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
-  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 843e8daba8b85..25d5a8722b9e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1926,15 +1926,13 @@ void VPlanTransforms::handleUncountableEarlyExit(
 }
 
 static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
-  if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
+  if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe()))
     return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
-  }
 
-  if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe())) {
+  if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe()))
     return IR->getInterleaveGroup()->getFactor() ==
                IR->getInterleaveGroup()->getNumMembers() &&
            IR->getVPValue(Idx) == V;
-  }
   return false;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index ffe66ff0007fb..1799f69d3d5af 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -112,15 +112,14 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -193,15 +192,14 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -274,15 +272,14 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -355,15 +352,14 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -442,17 +438,15 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -527,8 +521,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
@@ -539,7 +532,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -660,12 +653,10 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
@@ -676,7 +667,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
index accd340a0d8ea..ea3e9f6b40295 100644
--- a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -22,7 +22,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
@@ -36,7 +36,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
 ; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1)
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP15]], i32 0
 ; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
 ; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP15]], i32 1
@@ -51,7 +51,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
 ; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP19]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -96,7 +96,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
@@ -122,7 +122,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
 ; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1)
 ; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
 ; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
 ; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
@@ -149,7 +149,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
 ; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP33]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
@@ -220,7 +220,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
@@ -229,7 +229,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1)
 ; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
 ; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]]
 ; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
@@ -249,7 +249,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
 ; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -294,7 +294,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
@@ -311,7 +311,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1)
 ; VF4-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
 ; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP21]]
 ; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
@@ -347,7 +347,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
 ; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
@@ -418,7 +418,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
@@ -428,7 +428,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
-; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1)
 ; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
 ; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
@@ -447,7 +447,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
 ; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -492,7 +492,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
@@ -510,7 +510,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
 ; VF4-NEXT:    [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
-; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1)
 ; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
 ; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
 ; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
@@ -545,7 +545,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
 ; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP29]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
@@ -616,7 +616,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
@@ -626,7 +626,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
-; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1)
 ; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
 ; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
@@ -645,7 +645,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
 ; VF2-NEXT:    store i64 [[TMP26]], ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -690,7 +690,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
@@ -708,7 +708,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
 ; VF4-NEXT:    [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
-; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1)
 ; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
 ; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
 ; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
@@ -743,7 +743,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
 ; VF4-NEXT:    store i64 [[TMP46]], ptr [[TMP11]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
@@ -814,7 +814,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
@@ -825,7 +825,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF2-NEXT:    store i64 [[TMP9]], ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
 ; VF2-NEXT:    store i64 [[TMP10]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT:    [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], splat (i64 1)
 ; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP12]]
 ; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
@@ -843,7 +843,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
 ; VF2-NEXT:    store i64 [[TMP24]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF2-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -890,7 +890,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
@@ -909,7 +909,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF4-NEXT:    store i64 [[TMP15]], ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
 ; VF4-NEXT:    store i64 [[TMP16]], ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT:    [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], splat (i64 1)
 ; VF4-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
 ; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
 ; VF4-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
@@ -939,7 +939,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; VF4-NEXT:    [[TMP40:%.*]] = extractelement <4 x i64> [[TMP36]], i32 3
 ; VF4-NEXT:    store i64 [[TMP40]], ptr [[TMP25]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; VF4-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VF4-NEXT:    br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:

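Much of the test churn in this patch is the disappearance of redundant zero-offset pointer recipes: a getelementptr of a pointer with index 0 simply reproduces that pointer, so once the narrowed loads and stores use the group's base address directly, the leftover GEP has no users and dead-recipe removal drops it. The underlying identity, restated as a trivial C++ sanity check (illustrative only):

    #include <cassert>

    int main() {
      long buf[4] = {0, 1, 2, 3};
      long *base = buf;
      long *adjusted = base + 0; // analogue of "gep inbounds i64, ptr %p, i32 0"
      assert(adjusted == base);  // a zero offset is the identity on pointers
      return 0;
    }
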
>From 9312264f2c95e2b17721558321edbf7a5952b66a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Dec 2024 20:00:00 +0000
Subject: [PATCH 09/19] !fixup address latest comments, thanks!

---
 .../lib/Transforms/Vectorize/VPlanTransforms.cpp | 16 ++++++++--------
 llvm/lib/Transforms/Vectorize/VPlanTransforms.h  |  5 +++++
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index bc9fd7be7b171..cd61d7e79021e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1963,8 +1963,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
 
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
-    if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-        isa<VPCanonicalIVPHIRecipe>(&R))
+    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
+        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
       continue;
 
     // Bail out on recipes not supported at the moment:
@@ -2024,13 +2024,13 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
     // Narrow wide load to uniform scalar load, as transformed VPlan will only
     // process one original iteration.
     auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
-                                    WideLoad->operands(), true);
+                                    WideLoad->operands(), /*IsUniform*/ true);
     // Narrow interleave group to wide load, as transformed VPlan will only
     // process one original iteration.
     auto *L = new VPWidenLoadRecipe(
         *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-        LoadGroup->getAddr(), LoadGroup->getMask(), true, false,
-        LoadGroup->getDebugLoc());
+        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+        /*Reverse=*/false, LoadGroup->getDebugLoc());
     L->insertBefore(LoadGroup);
     N->insertBefore(LoadGroup);
     Lane0->setOperand(LoadGroupIdx, L);
@@ -2038,8 +2038,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
 
     auto *S = new VPWidenStoreRecipe(
         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
-        StoreGroup->getAddr(), Lane0, nullptr, true, false,
-        StoreGroup->getDebugLoc());
+        StoreGroup->getAddr(), Lane0, nullptr, /*Consecutive=*/true,
+        /*Reverse=*/false, StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
   }
@@ -2047,7 +2047,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   // Adjust induction to reflect that the transformed plan only processes one
   // original iteration.
   auto *CanIV = Plan.getCanonicalIV();
-  VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
   Inc->setOperand(
       1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index faa37b8e61565..eea105cb3c4d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -138,6 +138,11 @@ struct VPlanTransforms {
   /// Lower abstract recipes to concrete ones, that can be codegen'd.
   static void convertToConcreteRecipes(VPlan &Plan);
 
+  /// Try to convert a plan with interleave groups with VF elements to a plan
+  /// with the
+  /// the interleave groups replaced by wide loads and stores processing VF
+  /// elements. This effectively is a very simple form of loop-aware SLP, where
+  /// we use interleave groups to identify candidates.
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
 };
 

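The doc comment's "very simple form of loop-aware SLP" can be made concrete: an interleave group already bundles the isomorphic per-member operations of one original iteration, so the transform can turn the group's members into the lanes of a single wide operation instead of de-interleaving VF iterations. The complex-add pattern exercised by the AArch64 test later in the series is the canonical shape; a rough source-level sketch (types and names abbreviated, not from the patch):

    struct Cplx { double re, im; };

    // Each iteration applies the same op (fadd) to both members, so
    // {re, im} can become the two lanes of one <2 x double> fadd after
    // narrowing, with no interleaving shuffles.
    void complex_add(Cplx *res, const Cplx *a, const Cplx *b, long n) {
      for (long i = 0; i < n; i++) {
        res[i].re = a[i].re + b[i].re;
        res[i].im = a[i].im + b[i].im;
      }
    }
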
>From b5ada93bce58a5f81ad0a35bfa59f325da7ddf7c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 7 Jan 2025 15:23:13 +0000
Subject: [PATCH 10/19] !fixup after merging main

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp           | 5 -----
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5 +++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index df1aaac4ce466..c3b4013371ce7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -938,11 +938,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   // FIXME: Model VF * UF computation completely in VPlan.
   unsigned UF = getUF();
 
-  if (VFxUF.getNumUsers() == 0) {
-    assert(VF.getNumUsers() == 0 && "expected no users of VF");
-    return;
-  }
-
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
     VF.setUnderlyingValue(RuntimeVF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 55860f5578089..57b004c6a4edb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2022,11 +2022,12 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
 
 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   using namespace llvm::VPlanPatternMatch;
-  if (VF.isScalable())
+  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+  if (VF.isScalable() || !VectorLoop)
     return;
 
   SmallVector<VPInterleaveRecipe *> StoreGroups;
-  for (auto &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
+  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) ||
         match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
       continue;

>From 95cf546a26dbb4b1220a427150d0e622c121fca2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 7 Jan 2025 15:35:30 +0000
Subject: [PATCH 11/19] !fixup remove stray whitespace

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c3b4013371ce7..97b738e39955b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -937,7 +937,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
   unsigned UF = getUF();
-
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
     VF.setUnderlyingValue(RuntimeVF);

>From 1110761ec188dcdc72f9db095f4b79bc0e876738 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 7 Jan 2025 16:20:39 +0000
Subject: [PATCH 12/19] !fixup reflow comment

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index eea105cb3c4d0..86e1e95890f1d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -139,8 +139,7 @@ struct VPlanTransforms {
   static void convertToConcreteRecipes(VPlan &Plan);
 
   /// Try to convert a plan with interleave groups with VF elements to a plan
-  /// with the
-  /// the interleave groups replaced by wide loads and stores processing VF
+  /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements. This effectively is a very simple form of loop-aware SLP, where
   /// we use interleave groups to identify candidates.
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF);

>From 3494339ae27d1b847912ce8adfbb16e9c17f1923 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 15 Mar 2025 17:12:51 +0000
Subject: [PATCH 13/19] !fixup update after merge.

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 46 +++++++++----------
 ...sform-narrow-interleave-to-widen-memory.ll | 28 +++--------
 ...sform-narrow-interleave-to-widen-memory.ll |  8 ++++
 3 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b74092547bdcb..720d07a67d690 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2306,35 +2306,35 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   if (StoreGroups.empty())
     return;
 
-  // Narrow operation tree rooted at store groups.
-  for (auto *StoreGroup : StoreGroups) {
-    auto *Lane0 = cast<VPWidenRecipe>(
-        StoreGroup->getStoredValues()[0]->getDefiningRecipe());
-
-    unsigned LoadGroupIdx =
-        isa<VPInterleaveRecipe>(Lane0->getOperand(1)->getDefiningRecipe()) ? 1
-                                                                           : 0;
-    unsigned WideLoadIdx = 1 - LoadGroupIdx;
-    auto *LoadGroup = cast<VPInterleaveRecipe>(
-        Lane0->getOperand(LoadGroupIdx)->getDefiningRecipe());
+  auto Narrow = [](VPRecipeBase *R) -> VPValue * {
+    if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
+      // Narrow interleave group to wide load, as transformed VPlan will only
+      // process one original iteration.
+      auto *L = new VPWidenLoadRecipe(
+          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+          /*Reverse=*/false, LoadGroup->getDebugLoc());
+      L->insertBefore(LoadGroup);
+      return L;
+    }
 
-    auto *WideLoad = cast<VPWidenLoadRecipe>(
-        Lane0->getOperand(WideLoadIdx)->getDefiningRecipe());
+    auto *WideLoad = cast<VPWidenLoadRecipe>(R);
 
     // Narrow wide load to uniform scalar load, as transformed VPlan will only
     // process one original iteration.
     auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
                                     WideLoad->operands(), /*IsUniform*/ true);
-    // Narrow interleave group to wide load, as transformed VPlan will only
-    // process one original iteration.
-    auto *L = new VPWidenLoadRecipe(
-        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-        /*Reverse=*/false, LoadGroup->getDebugLoc());
-    L->insertBefore(LoadGroup);
-    N->insertBefore(LoadGroup);
-    Lane0->setOperand(LoadGroupIdx, L);
-    Lane0->setOperand(WideLoadIdx, N);
+    N->insertBefore(WideLoad);
+    return N;
+  };
+
+  // Narrow operation tree rooted at store groups.
+  for (auto *StoreGroup : StoreGroups) {
+    auto *Lane0 = cast<VPWidenRecipe>(
+        StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+    Lane0->setOperand(0, Narrow(Lane0->getOperand(0)->getDefiningRecipe()));
+    Lane0->setOperand(1, Narrow(Lane0->getOperand(1)->getDefiningRecipe()));
 
     auto *S = new VPWidenStoreRecipe(
         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 3fca274a3bb12..a0d9327de5aed 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -122,31 +122,17 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = load <2 x double>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index 755ac74c90893..11994ff6398f4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,6 +20,14 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT:    store <4 x i64> [[TMP5]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
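
The test updates above show the intended end state of the narrowing: once every member of a load interleave group feeds the matching member of a store interleave group, the wide-load/deinterleave/interleave/wide-store sequence collapses into plain consecutive memory operations, and the induction step drops to one original iteration. A condensed before/after sketch for a factor-2 group at VF 2, based on the test output above (value names are illustrative):

  ; Before: interleave-group codegen, step 2.
  %wide.vec  = load <4 x i64>, ptr %gep, align 8
  %strided.0 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
  %strided.1 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  %concat    = shufflevector <2 x i64> %strided.0, <2 x i64> %strided.1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ileave    = shufflevector <4 x i64> %concat, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %ileave, ptr %gep, align 8
  %index.next = add nuw i64 %index, 2

  ; After: narrowed to wide memory ops, step 1.
  %wide.load = load <2 x i64>, ptr %gep, align 8
  store <2 x i64> %wide.load, ptr %gep, align 8
  %index.next = add nuw i64 %index, 1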

>From 3fd2b8d9668bdd235f393d7ba55b5b762e3b3ff1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 16 Mar 2025 09:13:42 +0000
Subject: [PATCH 14/19] !fixup address comments, thanks!

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  73 ++++++---
 ...nterleave-to-widen-memory-with-wide-ops.ll |  26 ++--
 ...sform-narrow-interleave-to-widen-memory.ll | 145 ++++++++++++++++--
 3 files changed, 189 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 720d07a67d690..ceeaadcabac1f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2234,10 +2234,13 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
 }
 
 static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
-  if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe()))
-    return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
+  auto *DefR = V->getDefiningRecipe();
+  if (!DefR)
+    return false;
+  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
+    return !W->getMask() && is_contained(R0->operands(), V);
 
-  if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe()))
+  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
     return IR->getInterleaveGroup()->getFactor() ==
                IR->getInterleaveGroup()->getNumMembers() &&
            IR->getVPValue(Idx) == V;
@@ -2246,13 +2249,12 @@ static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
 
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF.
+/// Returns true if \p InterleaveR is a full interleave group with factor and
+/// number of members both equal to \p VF.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
-                                         ElementCount VF) {
-  if (!IR)
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
+                                         unsigned VF) {
+  if (!InterleaveR)
     return false;
-  auto IG = IR->getInterleaveGroup();
-  return IG->getFactor() == IG->getNumMembers() &&
-         IG->getNumMembers() == VF.getFixedValue();
+  auto IG = InterleaveR->getInterleaveGroup();
+  return IG->getFactor() == VF && IG->getNumMembers() == VF;
 }
 
 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
@@ -2261,6 +2263,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
   if (VF.isScalable() || !VectorLoop)
     return;
 
+  unsigned FixedVF = VF.getFixedValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) ||
@@ -2274,25 +2277,42 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
     if (R.isPhi())
       return;
 
-    auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
-    if (R.mayWriteToMemory() && !IR)
+    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (R.mayWriteToMemory() && !InterleaveR)
       return;
 
-    if (!IR)
+    if (!InterleaveR)
       continue;
 
-    if (!isConsecutiveInterleaveGroup(IR, VF))
+    // Bail out on non-consecutive interleave groups.
+    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF))
       return;
-    if (IR->getStoredValues().empty())
+
+    // Skip read interleave groups.
+    if (InterleaveR->getStoredValues().empty())
+      continue;
+
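+    // If every stored value is produced directly by the matching member of a
+    // full (load) interleave group, the store group can be narrowed without
+    // rewriting any intermediate wide recipes.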
+    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
+          if (!DefR)
+            return false;
+          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
+          return IR &&
+                 IR->getInterleaveGroup()->getFactor() ==
+                     IR->getInterleaveGroup()->getNumMembers() &&
+                 IR->getVPValue(Op.index()) == Op.value();
+        })) {
+      StoreGroups.push_back(InterleaveR);
       continue;
+    }
 
     auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
-        IR->getStoredValues()[0]->getDefiningRecipe());
+        InterleaveR->getStoredValues()[0]->getDefiningRecipe());
     if (!Lane0)
       return;
-    for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
+    for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
-      if (!R || R->getOpcode() != Lane0->getOpcode())
+      if (!R || R->getOpcode() != Lane0->getOpcode() || R->getNumOperands() > 2)
         return;
       if (any_of(R->operands(), [Lane0, Idx = I](VPValue *V) {
             return !supportedLoad(Lane0, V, Idx);
@@ -2300,7 +2320,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
         return;
     }
 
-    StoreGroups.push_back(IR);
+    StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
@@ -2330,15 +2350,20 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
 
   // Narrow operation tree rooted at store groups.
   for (auto *StoreGroup : StoreGroups) {
-    auto *Lane0 = cast<VPWidenRecipe>(
-        StoreGroup->getStoredValues()[0]->getDefiningRecipe());
-
-    Lane0->setOperand(0, Narrow(Lane0->getOperand(0)->getDefiningRecipe()));
-    Lane0->setOperand(1, Narrow(Lane0->getOperand(1)->getDefiningRecipe()));
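+    // The stored values are either defined by a wide recipe, in which case
+    // each of its operands is narrowed, or directly by an interleave (load)
+    // group, which is narrowed itself.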
+    VPValue *Res = nullptr;
+    if (auto *Lane0 = dyn_cast<VPWidenRecipe>(
+            StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
+      for (unsigned Idx = 0, E = Lane0->getNumOperands(); Idx != E; ++Idx)
+        Lane0->setOperand(Idx,
+                          Narrow(Lane0->getOperand(Idx)->getDefiningRecipe()));
+      Res = Lane0;
+    } else {
+      Res = Narrow(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+    }
 
     auto *S = new VPWidenStoreRecipe(
         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
-        StoreGroup->getAddr(), Lane0, nullptr, /*Consecutive=*/true,
+        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
         /*Reverse=*/false, StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 146a1c4749f9b..f9bc195c130a6 100644
--- a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -16,15 +16,10 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
+; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
 ; VF2-NEXT:    [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
-; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP2]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -256,18 +251,15 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP23:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
-; VF2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
index d8cb8d38e3067..4531ce7b3cecd 100644
--- a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -16,13 +16,9 @@ define void @load_store_interleave_group(ptr noalias %data) {
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -120,14 +116,10 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -323,3 +315,128 @@ loop:
 exit:
   ret void
 }
+
+define void @same_load_group_used_by_multiple_load_groups(ptr noalias %src, ptr noalias %A, ptr noalias %B) {
+; VF2-LABEL: define void @same_load_group_used_by_multiple_load_groups(
+; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF2-NEXT:    [[A_0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[MUL_2]]
+; VF2-NEXT:    store i64 [[L_0]], ptr [[A_0]], align 8
+; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF2-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
+; VF2-NEXT:    [[A_1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD_1]]
+; VF2-NEXT:    store i64 [[L_1]], ptr [[A_1]], align 8
+; VF2-NEXT:    [[B_0:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[MUL_2]]
+; VF2-NEXT:    store i64 [[L_0]], ptr [[B_0]], align 8
+; VF2-NEXT:    [[B_1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD_1]]
+; VF2-NEXT:    store i64 [[L_1]], ptr [[B_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @same_load_group_used_by_multiple_load_groups(
+; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
+; VF4-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF4-NEXT:    [[A_0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[MUL_2]]
+; VF4-NEXT:    store i64 [[L_0]], ptr [[A_0]], align 8
+; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF4-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
+; VF4-NEXT:    [[A_1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD_1]]
+; VF4-NEXT:    store i64 [[L_1]], ptr [[A_1]], align 8
+; VF4-NEXT:    [[B_0:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[MUL_2]]
+; VF4-NEXT:    store i64 [[L_0]], ptr [[B_0]], align 8
+; VF4-NEXT:    [[B_1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD_1]]
+; VF4-NEXT:    store i64 [[L_1]], ptr [[B_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul.2 = shl nsw i64 %iv, 1
+  %src.0 = getelementptr inbounds i64, ptr %src, i64 %mul.2
+  %l.0 = load i64, ptr %src.0, align 8
+  %A.0 = getelementptr inbounds i64, ptr %A, i64 %mul.2
+  store i64 %l.0, ptr %A.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %src.1 = getelementptr inbounds i64, ptr %src, i64 %add.1
+  %l.1 = load i64, ptr %src.1, align 8
+  %A.1 = getelementptr inbounds i64, ptr %A, i64 %add.1
+  store i64 %l.1, ptr %A.1, align 8
+  %B.0 = getelementptr inbounds i64, ptr %B, i64 %mul.2
+  store i64 %l.0, ptr %B.0, align 8
+  %B.1 = getelementptr inbounds i64, ptr %B, i64 %add.1
+  store i64 %l.1, ptr %B.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
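
The new test pins down what happens when a single load group feeds multiple store groups. With VF 2 the factor-2 groups are consecutive (factor == fixed VF), so each store group is narrowed independently and the narrowed load is materialized once per use; with VF 4 the factor-2 groups fail the factor == VF check in isConsecutiveInterleaveGroup, the transform bails out, and the interleave codegen is kept. A condensed contrast, with illustrative value names:

  ; VF 2: factor (2) == VF (2), narrowed per store group.
  %l.a = load <2 x i64>, ptr %src.gep, align 8
  store <2 x i64> %l.a, ptr %a.gep, align 8
  %l.b = load <2 x i64>, ptr %src.gep, align 8
  store <2 x i64> %l.b, ptr %b.gep, align 8

  ; VF 4: factor (2) != VF (4), interleave codegen retained.
  %wide.vec = load <8 x i64>, ptr %src.gep, align 8
  ; (deinterleave/interleave shuffles as before the transform)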

>From b9b4fc292783702785b4c6cacda9630ef308d8ed Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 16 Mar 2025 10:22:52 +0000
Subject: [PATCH 15/19] [LV] Reorganize tests for narrowing interleave group
 transforms.

---
 ...-narrow-interleave-to-widen-memory-cost.ll | 204 +++++++
 ...nterleave-to-widen-memory-with-wide-ops.ll |  34 +-
 ...sform-narrow-interleave-to-widen-memory.ll | 508 ++++++++++++------
 ...sform-narrow-interleave-to-widen-memory.ll | 325 -----------
 4 files changed, 565 insertions(+), 506 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
 rename llvm/test/Transforms/LoopVectorize/{ => AArch64}/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll (97%)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
new file mode 100644
index 0000000000000..99cdb84f0a6fe
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_float(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV1]]
+; CHECK-NEXT:    [[L_A_0:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_2]], i64 4
+; CHECK-NEXT:    [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[L_B_0:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_2]], i64 4
+; CHECK-NEXT:    [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT:    [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV1]]
+; CHECK-NEXT:    store float [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT:    [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4
+; CHECK-NEXT:    store float [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { float, float }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { float, float }, ptr %B, i64 %iv
+  %l.A.0 = load float, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 4
+  %l.A.1 = load float, ptr %gep.A.1, align 4
+  %l.B.0 = load float, ptr %gep.B.0, align 4
+  %add.0 = fadd float %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 4
+  %l.B.1 = load float, ptr %gep.B.1, align 4
+  %add.1 = fadd float %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { float, float }, ptr %res, i64 %iv
+  store float %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 4
+  store float %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_double(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 8
+; CHECK-NEXT:    [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 8
+; CHECK-NEXT:    [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT:    [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT:    store double [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT:    [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 8
+; CHECK-NEXT:    store double [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { double, double }, ptr %B, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %l.B.0 = load double, ptr %gep.B.0, align 4
+  %add.0 = fadd double %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 8
+  %l.B.1 = load double, ptr %gep.B.1, align 4
+  %add.1 = fadd double %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
similarity index 97%
rename from llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
rename to llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 146a1c4749f9b..de0d6a593e877 100644
--- a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=2 -enable-interleaved-mem-accesses=true -S %s | FileCheck --check-prefixes=VF2 %s
-; RUN: opt -p loop-vectorize -force-vector-width=4 -enable-interleaved-mem-accesses=true -S %s | FileCheck --check-prefixes=VF4 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
 
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
 
 define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) {
 ; VF2-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
@@ -14,17 +15,28 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
 ; VF2:       [[VECTOR_BODY]]:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP12:%.*]] = shl nsw i64 [[TMP10]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
 ; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
 ; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
 ; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP13]], align 8
+; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
 ; VF2-NEXT:    [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
+; VF2-NEXT:    [[TMP14:%.*]] = fneg <2 x double> [[STRIDED_VEC3]]
 ; VF2-NEXT:    [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
+; VF2-NEXT:    [[TMP9:%.*]] = fneg <2 x double> [[STRIDED_VEC4]]
 ; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -128,20 +140,32 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
 ; VF2:       [[VECTOR_BODY]]:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP16:%.*]] = shl nsw i64 [[TMP14]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 2
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[TMP4:%.*]] = fneg <2 x double> [[WIDE_LOAD]]
+; VF2-NEXT:    [[TMP19:%.*]] = fneg <2 x double> [[WIDE_LOAD2]]
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP16]]
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 2
 ; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
+; VF2-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[WIDE_LOAD3]]
 ; VF2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP17:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x double> [[TMP17]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC4]], ptr [[TMP20]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 3fca274a3bb12..95427d86c4d93 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -1,203 +1,359 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
-target triple = "arm64-apple-macosx15.0.0"
+target triple = "arm64-apple-macosx"
 
-define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
-; CHECK-LABEL: define void @test_complex_add_float(
-; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV1]]
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV1]]
-; CHECK-NEXT:    [[L_A_0:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_2]], i64 4
-; CHECK-NEXT:    [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[L_B_0:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_2]], i64 4
-; CHECK-NEXT:    [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV1]]
-; CHECK-NEXT:    store float [[ADD_0]], ptr [[GEP_RES_0]], align 4
-; CHECK-NEXT:    [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4
-; CHECK-NEXT:    store float [[ADD_1]], ptr [[GEP_RES_1]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
+define void @load_store_interleave_group(ptr noalias %data) {
+; VF2-LABEL: define void @load_store_interleave_group(
+; VF2-SAME: ptr noalias [[DATA:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT:    store i64 [[L_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT:    store i64 [[L_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @load_store_interleave_group(
+; VF4-SAME: ptr noalias [[DATA:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF4-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT:    store i64 [[L_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT:    store i64 [[L_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul.2 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+  %l.0 = load i64, ptr %data.0, align 8
+  store i64 %l.0, ptr %data.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+  %l.1 = load i64, ptr %data.1, align 8
+  store i64 %l.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr noalias %dst) {
+; VF2-LABEL: define void @load_store_interleave_group_different_objecs(
+; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP8]]
+; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
+; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]]
+; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF2-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
+; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
+; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF2-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
+; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
+; VF2-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
+; VF2-NEXT:    store i64 [[L_1]], ptr [[DST_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @load_store_interleave_group_different_objecs(
+; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
+; VF4-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; VF4-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF4-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
+; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
+; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF4-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
+; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
+; VF4-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
+; VF4-NEXT:    store i64 [[L_1]], ptr [[DST_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %gep.A.0 = getelementptr inbounds nuw { float, float }, ptr %A, i64 %iv
-  %gep.B.0 = getelementptr inbounds nuw { float, float }, ptr %B, i64 %iv
-  %l.A.0 = load float, ptr %gep.A.0, align 4
-  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 4
-  %l.A.1 = load float, ptr %gep.A.1, align 4
-  %l.B.0 = load float, ptr %gep.B.0, align 4
-  %add.0 = fadd float %l.A.0, %l.B.0
-  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 4
-  %l.B.1 = load float, ptr %gep.B.1, align 4
-  %add.1 = fadd float %l.A.1, %l.B.1
-  %gep.res.0 = getelementptr inbounds nuw { float, float }, ptr %res, i64 %iv
-  store float %add.0, ptr %gep.res.0, align 4
-  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 4
-  store float %add.1, ptr %gep.res.1, align 4
+  %mul.2 = shl nsw i64 %iv, 1
+  %src.0 = getelementptr inbounds i64, ptr %src, i64 %mul.2
+  %l.0 = load i64, ptr %src.0, align 8
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %mul.2
+  store i64 %l.0, ptr %dst.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %src.1 = getelementptr inbounds i64, ptr %src, i64 %add.1
+  %l.1 = load i64, ptr %src.1, align 8
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %add.1
+  store i64 %l.1, ptr %dst.1, align 8
   %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv.next, %N
+  %ec = icmp eq i64 %iv.next, 100
   br i1 %ec, label %exit, label %loop
 
 exit:
   ret void
 }
 
-define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
-; CHECK-LABEL: define void @test_complex_add_double(
-; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 8
-; CHECK-NEXT:    [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd double [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 8
-; CHECK-NEXT:    [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[IV]]
-; CHECK-NEXT:    store double [[ADD_0]], ptr [[GEP_RES_0]], align 4
-; CHECK-NEXT:    [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 8
-; CHECK-NEXT:    store double [[ADD_1]], ptr [[GEP_RES_1]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
+define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noalias %dst) {
+; VF2-LABEL: define void @single_wide_load_store_interleave_group(
+; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*]]:
+; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD]], <2 x i64> [[WIDE_LOAD]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD1]], <2 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC2:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC2]], ptr [[TMP8]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2:       [[SCALAR_PH]]:
+; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT:    br label %[[LOOP:.*]]
+; VF2:       [[LOOP]]:
+; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF2-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
+; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
+; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF2-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
+; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @single_wide_load_store_interleave_group(
+; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*]]:
+; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF4-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[WIDE_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC2:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC2]], ptr [[TMP8]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF4-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; VF4-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4:       [[SCALAR_PH]]:
+; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT:    br label %[[LOOP:.*]]
+; VF4:       [[LOOP]]:
+; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
+; VF4-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
+; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
+; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
+; VF4-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
+; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_1]], align 8
+; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
-  %gep.B.0 = getelementptr inbounds nuw { double, double }, ptr %B, i64 %iv
-  %l.A.0 = load double, ptr %gep.A.0, align 4
-  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
-  %l.A.1 = load double, ptr %gep.A.1, align 4
-  %l.B.0 = load double, ptr %gep.B.0, align 4
-  %add.0 = fadd double %l.A.0, %l.B.0
-  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 8
-  %l.B.1 = load double, ptr %gep.B.1, align 4
-  %add.1 = fadd double %l.A.1, %l.B.1
-  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
-  store double %add.0, ptr %gep.res.0, align 4
-  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
-  store double %add.1, ptr %gep.res.1, align 4
+  %mul.2 = shl nsw i64 %iv, 1
+  %src.0 = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l.0 = load i64, ptr %src.0, align 8
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %mul.2
+  store i64 %l.0, ptr %dst.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %add.1
+  store i64 %l.0, ptr %dst.1, align 8
   %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv.next, %N
+  %ec = icmp eq i64 %iv.next, 100
   br i1 %ec, label %exit, label %loop
 
 exit:
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
deleted file mode 100644
index d8cb8d38e3067..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
+++ /dev/null
@@ -1,325 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=2 -enable-interleaved-mem-accesses=true -S %s | FileCheck --check-prefixes=VF2 %s
-; RUN: opt -p loop-vectorize -force-vector-width=4 -enable-interleaved-mem-accesses=true -S %s | FileCheck --check-prefixes=VF4 %s
-
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-
-define void @load_store_interleave_group(ptr noalias %data) {
-; VF2-LABEL: define void @load_store_interleave_group(
-; VF2-SAME: ptr noalias [[DATA:%.*]]) {
-; VF2-NEXT:  [[ENTRY:.*]]:
-; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF2:       [[VECTOR_PH]]:
-; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF2:       [[VECTOR_BODY]]:
-; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF2-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF2:       [[MIDDLE_BLOCK]]:
-; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF2:       [[SCALAR_PH]]:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF2-NEXT:    br label %[[LOOP:.*]]
-; VF2:       [[LOOP]]:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]]
-; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; VF2-NEXT:    store i64 [[L_0]], ptr [[DATA_0]], align 8
-; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]]
-; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; VF2-NEXT:    store i64 [[L_1]], ptr [[DATA_1]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF2:       [[EXIT]]:
-; VF2-NEXT:    ret void
-;
-; VF4-LABEL: define void @load_store_interleave_group(
-; VF4-SAME: ptr noalias [[DATA:%.*]]) {
-; VF4-NEXT:  [[ENTRY:.*]]:
-; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF4:       [[VECTOR_PH]]:
-; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF4:       [[VECTOR_BODY]]:
-; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF4-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; VF4-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; VF4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF4-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF4:       [[MIDDLE_BLOCK]]:
-; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF4:       [[SCALAR_PH]]:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF4-NEXT:    br label %[[LOOP:.*]]
-; VF4:       [[LOOP]]:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF4-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]]
-; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; VF4-NEXT:    store i64 [[L_0]], ptr [[DATA_0]], align 8
-; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF4-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]]
-; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; VF4-NEXT:    store i64 [[L_1]], ptr [[DATA_1]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF4:       [[EXIT]]:
-; VF4-NEXT:    ret void
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul.2 = shl nsw i64 %iv, 1
-  %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
-  %l.0 = load i64, ptr %data.0, align 8
-  store i64 %l.0, ptr %data.0, align 8
-  %add.1 = or disjoint i64 %mul.2, 1
-  %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
-  %l.1 = load i64, ptr %data.1, align 8
-  store i64 %l.1, ptr %data.1, align 8
-  %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv.next, 100
-  br i1 %ec, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr noalias %dst) {
-; VF2-LABEL: define void @load_store_interleave_group_different_objecs(
-; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; VF2-NEXT:  [[ENTRY:.*]]:
-; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF2:       [[VECTOR_PH]]:
-; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF2:       [[VECTOR_BODY]]:
-; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; VF2:       [[MIDDLE_BLOCK]]:
-; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF2:       [[SCALAR_PH]]:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF2-NEXT:    br label %[[LOOP:.*]]
-; VF2:       [[LOOP]]:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF2-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
-; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
-; VF2-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
-; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
-; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF2-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
-; VF2-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
-; VF2-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
-; VF2-NEXT:    store i64 [[L_1]], ptr [[DST_1]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF2:       [[EXIT]]:
-; VF2-NEXT:    ret void
-;
-; VF4-LABEL: define void @load_store_interleave_group_different_objecs(
-; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; VF4-NEXT:  [[ENTRY:.*]]:
-; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF4:       [[VECTOR_PH]]:
-; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF4:       [[VECTOR_BODY]]:
-; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
-; VF4-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; VF4-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[STRIDED_VEC]], <4 x i64> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; VF4:       [[MIDDLE_BLOCK]]:
-; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF4:       [[SCALAR_PH]]:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF4-NEXT:    br label %[[LOOP:.*]]
-; VF4:       [[LOOP]]:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF4-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[MUL_2]]
-; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
-; VF4-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
-; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
-; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF4-NEXT:    [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[ADD_1]]
-; VF4-NEXT:    [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
-; VF4-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
-; VF4-NEXT:    store i64 [[L_1]], ptr [[DST_1]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF4:       [[EXIT]]:
-; VF4-NEXT:    ret void
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul.2 = shl nsw i64 %iv, 1
-  %src.0 = getelementptr inbounds i64, ptr %src, i64 %mul.2
-  %l.0 = load i64, ptr %src.0, align 8
-  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %mul.2
-  store i64 %l.0, ptr %dst.0, align 8
-  %add.1 = or disjoint i64 %mul.2, 1
-  %src.1 = getelementptr inbounds i64, ptr %src, i64 %add.1
-  %l.1 = load i64, ptr %src.1, align 8
-  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %add.1
-  store i64 %l.1, ptr %dst.1, align 8
-  %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv.next, 100
-  br i1 %ec, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noalias %dst) {
-; VF2-LABEL: define void @single_wide_load_store_interleave_group(
-; VF2-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; VF2-NEXT:  [[ENTRY:.*]]:
-; VF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF2:       [[VECTOR_PH]]:
-; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF2:       [[VECTOR_BODY]]:
-; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD]], <2 x i64> [[WIDE_LOAD]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; VF2:       [[MIDDLE_BLOCK]]:
-; VF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF2:       [[SCALAR_PH]]:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF2-NEXT:    br label %[[LOOP:.*]]
-; VF2:       [[LOOP]]:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF2-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF2-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
-; VF2-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
-; VF2-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
-; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
-; VF2-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF2-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
-; VF2-NEXT:    store i64 [[L_0]], ptr [[DST_1]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF2:       [[EXIT]]:
-; VF2-NEXT:    ret void
-;
-; VF4-LABEL: define void @single_wide_load_store_interleave_group(
-; VF4-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; VF4-NEXT:  [[ENTRY:.*]]:
-; VF4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; VF4:       [[VECTOR_PH]]:
-; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
-; VF4:       [[VECTOR_BODY]]:
-; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF4-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; VF4:       [[MIDDLE_BLOCK]]:
-; VF4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; VF4:       [[SCALAR_PH]]:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VF4-NEXT:    br label %[[LOOP:.*]]
-; VF4:       [[LOOP]]:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; VF4-NEXT:    [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1
-; VF4-NEXT:    [[SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
-; VF4-NEXT:    [[L_0:%.*]] = load i64, ptr [[SRC_0]], align 8
-; VF4-NEXT:    [[DST_0:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[MUL_2]]
-; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_0]], align 8
-; VF4-NEXT:    [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1
-; VF4-NEXT:    [[DST_1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[ADD_1]]
-; VF4-NEXT:    store i64 [[L_0]], ptr [[DST_1]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VF4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; VF4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF4:       [[EXIT]]:
-; VF4-NEXT:    ret void
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul.2 = shl nsw i64 %iv, 1
-  %src.0 = getelementptr inbounds i64, ptr %src, i64 %iv
-  %l.0 = load i64, ptr %src.0, align 8
-  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %mul.2
-  store i64 %l.0, ptr %dst.0, align 8
-  %add.1 = or disjoint i64 %mul.2, 1
-  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %add.1
-  store i64 %l.0, ptr %dst.1, align 8
-  %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv.next, 100
-  br i1 %ec, label %exit, label %loop
-
-exit:
-  ret void
-}

>From 89d4f131fd1d176c66f557d97de57d7912d2053a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 16 Mar 2025 10:27:03 +0000
Subject: [PATCH 16/19] [VPlan]

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   2 -
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   2 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 130 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   9 ++
 ...sform-narrow-interleave-to-widen-memory.ll |  36 ++---
 6 files changed, 154 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 08e125eca591e..4ffe672195e3c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7697,6 +7697,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                            OrigLoop->getHeader()->getContext());
   VPlanTransforms::materializeBroadcasts(BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+
+  VPlanTransforms::narrowInterleaveGroups(
+      BestVPlan, BestVF,
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::removeDeadRecipes(BestVPlan);
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 38eba370d6d81..7b5a65ef5799a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -900,8 +900,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
-         "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c477070c8d95c..28008c6c80535 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -72,11 +72,11 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
+  case VPVectorPointerSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
-  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9aae383d35d91..a31be7b1bd9cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -30,6 +30,9 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
 using namespace llvm;
 
 void VPlanTransforms::VPInstructionsToVPRecipes(
@@ -2229,3 +2232,130 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
                            });
   }
 }
+
+/// Returns true if \p InterleaveR is a full interleave group with factor and
+/// number of members both equal to \p VF. The interleave group must also
+/// access the full vector width \p VectorRegWidth.
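+/// For example, with VF = 2, i64 group members and a 128-bit vector register,
+/// a factor-2 group with both members present qualifies: 2 * 64 bits fill the
+/// full register width.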
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
+                                         unsigned VF, VPTypeAnalysis &TypeInfo,
+                                         unsigned VectorRegWidth) {
+  if (!InterleaveR)
+    return false;
+  Type *GroupElementTy = nullptr;
+  if (InterleaveR->getStoredValues().empty()) {
+    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
+    if (!all_of(InterleaveR->definedValues(),
+                [&TypeInfo, GroupElementTy](VPValue *Op) {
+                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
+                }))
+      return false;
+  } else {
+    GroupElementTy =
+        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
+    if (!all_of(InterleaveR->getStoredValues(),
+                [&TypeInfo, GroupElementTy](VPValue *Op) {
+                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
+                }))
+      return false;
+  }
+
+  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
+
+  auto IG = InterleaveR->getInterleaveGroup();
+  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
+         GroupSize == VectorRegWidth;
+}
+
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                             unsigned VectorRegWidth) {
+  using namespace llvm::VPlanPatternMatch;
+  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+  if (VF.isScalable() || !VectorLoop)
+    return;
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  Type *CanonicalIVType = CanonicalIV->getScalarType();
+  VPTypeAnalysis TypeInfo(CanonicalIVType);
+
+  unsigned FixedVF = VF.getFixedValue();
+  SmallVector<VPInterleaveRecipe *> StoreGroups;
+  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
+    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
+        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+      continue;
+
+    // Only plans with a canonical induction phi are supported. Bail out on
+    // recipes not supported at the moment:
+    //  * phi recipes other than the canonical induction
+    //  * recipes writing to memory except interleave groups
+    if (R.isPhi())
+      return;
+
+    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (R.mayWriteToMemory() && !InterleaveR)
+      return;
+
+    if (!InterleaveR)
+      continue;
+
+    // Bail out on non-consecutive interleave groups.
+    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
+                                      VectorRegWidth))
+      return;
+
+    // Skip read interleave groups.
+    if (InterleaveR->getStoredValues().empty())
+      continue;
+
+    if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
+          if (!DefR)
+            return false;
+          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
+          return IR &&
+                 IR->getInterleaveGroup()->getFactor() ==
+                     IR->getInterleaveGroup()->getNumMembers() &&
+                 IR->getVPValue(Op.index()) == Op.value();
+        })) {
+      return;
+    }
+    StoreGroups.push_back(InterleaveR);
+  }
+
+  if (StoreGroups.empty())
+    return;
+
+  // Convert the load interleave group \p R to a single VPWidenLoadRecipe.
+  auto Narrow = [](VPRecipeBase *R) -> VPValue * {
+    auto *LoadGroup = cast<VPInterleaveRecipe>(R);
+    // Narrow the interleave group to a wide load, as the transformed VPlan
+    // will only process one original iteration.
+    auto *L = new VPWidenLoadRecipe(
+        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+        /*Reverse=*/false, LoadGroup->getDebugLoc());
+    L->insertBefore(LoadGroup);
+    return L;
+  };
+
+  // Narrow the operation trees rooted at the store groups.
+  for (auto *StoreGroup : StoreGroups) {
+    VPValue *Res =
+        Narrow(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+    auto *S = new VPWidenStoreRecipe(
+        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
+        /*Reverse=*/false, StoreGroup->getDebugLoc());
+    S->insertBefore(StoreGroup);
+    StoreGroup->eraseFromParent();
+  }
+
+  // Adjust induction to reflect that the transformed plan only processes one
+  // original iteration.
+  auto *CanIV = Plan.getCanonicalIV();
+  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  Inc->setOperand(
+      1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+  removeDeadRecipes(Plan);
+}
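
For reference, a minimal before/after sketch of the transform's effect on the
generated IR, distilled from the test updates below (assuming VF = 2, a
factor-2 interleave group of i64s and a 128-bit vector register; %ptr stands
in for the group's address):

  ; Before: each vector iteration loads a full <4 x i64> register,
  ; de-interleaves it, re-interleaves the results and stores them back.
  %wide.vec = load <4 x i64>, ptr %ptr, align 8
  %even = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
  %odd = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  %concat = shufflevector <2 x i64> %even, <2 x i64> %odd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved.vec = shufflevector <4 x i64> %concat, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i64> %interleaved.vec, ptr %ptr, align 8

  ; After: a single wide load and store of <2 x i64>; the narrowed plan
  ; processes one original iteration per vector iteration, so the canonical
  ; induction now steps by 1.
  %wide.load = load <2 x i64>, ptr %ptr, align 8
  store <2 x i64> %wide.load, ptr %ptr, align 8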
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 2ec07eea1ecce..e30a90e538d49 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -191,6 +191,15 @@ struct VPlanTransforms {
 
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
+
+  /// Try to convert a plan with interleave groups of VF elements to a plan
+  /// where the interleave groups are replaced by wide loads and stores
+  /// processing VF elements, if all transformed interleave groups access the
+  /// full vector width (checked via \p VectorRegWidth). This is effectively a
+  /// very simple form of loop-aware SLP, where interleave groups are used to
+  /// identify candidates.
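+  /// For example, at VF = 2 on a 128-bit register, a factor-2 interleave
+  /// group of i64 loads can be replaced by a single wide <2 x i64> load.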
+  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                     unsigned VectorRegWidth);
 };
 
 } // namespace llvm
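
As a rough illustration of the \p VectorRegWidth condition described above, here is a sketch under assumed semantics (the helper below is invented for exposition and is not code from this patch):

  // A consecutive interleave group of factor Factor over elements of EltBits
  // bits, vectorized with a fixed VF, covers Factor * FixedVF * EltBits bits
  // per vector iteration; narrowing is only attempted when that equals the
  // full vector register width.
  static bool groupFillsRegister(unsigned Factor, unsigned EltBits,
                                 unsigned FixedVF, unsigned VectorRegWidth) {
    return Factor * FixedVF * EltBits == VectorRegWidth;
  }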
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 95427d86c4d93..ad81570f6c03c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -20,19 +20,11 @@ define void @load_store_interleave_group(ptr noalias %data) {
 ; VF2-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -133,21 +125,13 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
 ; VF2-NEXT:    [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
 ; VF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP8]]
-; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
-; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
+; VF2-NEXT:    store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:

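A quick sanity check on why the collapsed form above is equivalent: de-interleaving a consecutive chunk into its strided members and immediately re-interleaving them is the identity permutation. A small self-contained C++ check of that claim (illustrative only, mirroring the <0,2>/<1,3> and <0,2,1,3> shuffle masks in the old check lines):

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t chunk[4] = {0, 1, 2, 3};                     // WIDE_VEC
    int64_t even[2] = {chunk[0], chunk[2]};              // STRIDED_VEC
    int64_t odd[2] = {chunk[1], chunk[3]};               // STRIDED_VEC1
    int64_t out[4] = {even[0], odd[0], even[1], odd[1]}; // INTERLEAVED_VEC
    for (int i = 0; i != 4; ++i)
      assert(out[i] == chunk[i]); // round-trip is the identity
    return 0;
  }
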
>From b45c3aa94f2619d4628e3251b25457cbaf87c891 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 21 Mar 2025 09:17:06 +0000
Subject: [PATCH 17/19] !fixup adjustments

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index effb3c2b93e89..c6ccdcbe4befb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -30,9 +30,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
 using namespace llvm;
 
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
@@ -2364,7 +2361,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     return;
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
-  auto Narrow = [](VPRecipeBase *R) -> VPValue * {
+  auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as the transformed VPlan will
       // only process one original iteration.
@@ -2392,11 +2389,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (auto *Lane0 = dyn_cast<VPWidenRecipe>(
             StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
       for (unsigned Idx = 0, E = Lane0->getNumOperands(); Idx != E; ++Idx)
-        Lane0->setOperand(Idx,
-                          Narrow(Lane0->getOperand(Idx)->getDefiningRecipe()));
+        Lane0->setOperand(
+            Idx, NarrowOp(Lane0->getOperand(Idx)->getDefiningRecipe()));
       Res = Lane0;
     } else {
-      Res = Narrow(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+      Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
     }
 
     auto *S = new VPWidenStoreRecipe(

>From 86ac70a765e6b11c6a93786f1cf55492988df99b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 21 Mar 2025 11:21:20 +0000
Subject: [PATCH 18/19] !fixup address remaining comments.

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  2 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 35 ++++++++++++-------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1bb54632e2f05..b40f7e6f99068 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -72,11 +72,11 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
-  case VPVectorPointerSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
   case VPReductionSC:
+  case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
   case VPWidenCastSC:
   case VPWidenGEPSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c6ccdcbe4befb..7ba6a9e2f28e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2236,7 +2236,14 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   }
 }
 
-static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+/// Returns true if \p V, used at \p Idx of a wide recipe, is defined by a
+/// VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower
+/// recipe. A VPWidenLoadRecipe can be narrowed to an index-independent load if
+/// it feeds all the wide ops at all indices (checked via the operands of the
+/// wide recipe at lane 0, \p R0). A VPInterleaveRecipe can be narrowed to a
+/// wide load if \p V used at index \p Idx is defined at \p Idx of the
+/// interleave group.
+static bool canNarrowLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
   auto *DefR = V->getDefiningRecipe();
   if (!DefR)
     return false;
@@ -2312,6 +2319,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (R.mayWriteToMemory() && !InterleaveR)
       return;
 
+    // All other ops are allowed, but we reject uses that cannot be converted
+    // when checking all allowed consumers (store interleave groups) below.
     if (!InterleaveR)
       continue;
 
@@ -2340,20 +2349,22 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       continue;
     }
 
-    auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
+    // Check if all values feeding InterleaveR are matching wide recipes whose
+    // operands can be narrowed.
+    auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
-    if (!Lane0)
+    if (!WideMember0)
       return;
     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
-      if (!R || R->getOpcode() != Lane0->getOpcode() || R->getNumOperands() > 2)
+      if (!R || R->getOpcode() != WideMember0->getOpcode() ||
+          R->getNumOperands() > 2)
         return;
-      if (any_of(R->operands(), [Lane0, Idx = I](VPValue *V) {
-            return !supportedLoad(Lane0, V, Idx);
+      if (any_of(R->operands(), [WideMember0, Idx = I](VPValue *V) {
+            return !canNarrowLoad(WideMember0, V, Idx);
           }))
         return;
     }
-
     StoreGroups.push_back(InterleaveR);
   }
 
@@ -2386,12 +2397,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   // Narrow operation tree rooted at store groups.
   for (auto *StoreGroup : StoreGroups) {
     VPValue *Res = nullptr;
-    if (auto *Lane0 = dyn_cast<VPWidenRecipe>(
+    if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
             StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
-      for (unsigned Idx = 0, E = Lane0->getNumOperands(); Idx != E; ++Idx)
-        Lane0->setOperand(
-            Idx, NarrowOp(Lane0->getOperand(Idx)->getDefiningRecipe()));
-      Res = Lane0;
+      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
+        WideMember0->setOperand(
+            Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
+      Res = WideMember0;
     } else {
       Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
     }

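The WideMember0 path above is what enables cases like test_2xi64, whose check lines are updated in the next fixup: both members of the store group are produced by matching wide multiplies fed by the same loads. A hypothetical C++ source for that pattern (reconstructed from the check lines; names invented):

  #include <cstdint>

  // The factor[i] load feeds both group members at every index, so it can be
  // narrowed to a single index-independent wide load.
  void scale_pairs(int64_t *data, const int64_t *factor, int64_t n) {
    for (int64_t i = 0; i != n; ++i) {
      data[2 * i] *= factor[i];
      data[2 * i + 1] *= factor[i];
    }
  }
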
>From b79c14c05fe87cd2c8bcd18f72979805965b966d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 22 Mar 2025 10:57:08 +0000
Subject: [PATCH 19/19] !fixup finalize comments

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 29 ++++++++++------
 ...nterleave-to-widen-memory-with-wide-ops.ll | 34 +++++++++----------
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7ba6a9e2f28e2..496bc5a550970 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2236,19 +2236,26 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   }
 }
 
-/// Returns true if \p V, used at \p Idx of a wide recipe, is defined by a
-/// VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower
-/// recipe. A VPWidenLoadRecipe can be narrowed to an index-independent load if
-/// it feeds all the wide ops at all indices (checked via the operands of the
-/// wide recipe at lane 0, \p R0). A VPInterleaveRecipe can be narrowed to a
-/// wide load if \p V used at index \p Idx is defined at \p Idx of the
-/// interleave group.
-static bool canNarrowLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+/// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can
+/// be converted to a narrower recipe. \p V is used by a wide recipe \p
+/// WideMember that feeds a store interleave group at index \p Idx; \p
+/// WideMember0 is the recipe feeding the same interleave group at index 0. A
+/// VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
+/// all wide ops at all indices (checked via the operands of the wide recipe
+/// at lane 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
+/// load if \p V is defined at \p Idx of a load interleave group.
+static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
+                          VPValue *V, unsigned Idx) {
   auto *DefR = V->getDefiningRecipe();
   if (!DefR)
     return false;
   if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
-    return !W->getMask() && is_contained(R0->operands(), V);
+    return !W->getMask() &&
+           all_of(zip(WideMember0->operands(), WideMember->operands()),
+                  [V](const auto P) {
+                    const auto &[WideMember0Op, WideMemberOp] = P;
+                    return (WideMember0Op == V) == (WideMemberOp == V);
+                  });
 
   if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
     return IR->getInterleaveGroup()->getFactor() ==
@@ -2360,8 +2367,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
           R->getNumOperands() > 2)
         return;
-      if (any_of(R->operands(), [WideMember0, Idx = I](VPValue *V) {
-            return !canNarrowLoad(WideMember0, V, Idx);
+      if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
+            return !canNarrowLoad(WideMember0, R, V, Idx);
           }))
         return;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 3f1b2ea3e0437..b250edb457da4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -195,9 +195,9 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP23:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
-; VF2-NEXT:    store <2 x i64> [[TMP24]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; VF2-NEXT:    store <2 x i64> [[TMP8]], ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -1099,30 +1099,28 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 2
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT5]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; VF2-NEXT:    [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
 ; VF2-NEXT:    [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
 ; VF2-NEXT:    [[TMP19:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP20:%.*]] = shl nsw i64 [[TMP1]], 1
 ; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
 ; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[DATA_0]], align 8
-; VF2-NEXT:    store <2 x i64> [[TMP13]], ptr [[DATA_1]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP14:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
+; VF2-NEXT:    [[TMP15:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT6]], [[BROADCAST_SPLAT4]]
+; VF2-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC4]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
 ; VF2-NEXT:    br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:

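For contrast, a hypothetical source for the ops-swapped case above (reconstructed from the check lines; names invented): the same wide loads feed the two group members in swapped operand positions, so the per-operand zip comparison in canNarrowLoad rejects the group and it stays interleaved, as the restored INTERLEAVED_VEC shuffles and the step-4 induction show:

  #include <cstdint>

  void sub_swapped(int64_t *data, const int64_t *a, const int64_t *b,
                   int64_t n) {
    for (int64_t i = 0; i != n; ++i) {
      data[2 * i] = a[i] - b[i];     // member 0: a - b
      data[2 * i + 1] = b[i] - a[i]; // member 1: b - a, operands swapped
    }
  }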

