[llvm] Vplan narrow interleave (PR #106441)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 12:21:29 PDT 2024
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/106441
None
>From ea52fe44ee1c26f6d361fd94b337c3c406ceaf42 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 28 Aug 2024 14:01:23 +0100
Subject: [PATCH 1/3] [VPlan] Use pointer to member 0 as VPInterleaveRecipe's
pointer arg.
Update VPInterleaveRecipe to always use the pointer to member 0 as its
pointer argument. In many cases, this removes unneeded index
adjustments and simplifies VPInterleaveRecipe::execute.

In some rare cases, the address of member 0 does not dominate the
insert position of the interleave group. In those cases, a PtrAdd
VPInstruction is emitted to compute the address of member 0 from the
address at the insert position. Alternatively, the recipe computing the
address of member 0 could be hoisted.
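For illustration, a rough sketch of the addressing change (made-up
value names; the actual output is in the test updates below):

  ; Before: address of the last accessed member, adjusted back to
  ; member 0 via a negative index.
  %iv.plus.3 = or disjoint i64 %iv, 3
  %gep.3     = getelementptr float, ptr %base, i64 %iv.plus.3
  %addr      = getelementptr float, ptr %gep.3, i32 -3

  ; After: address of member 0 used directly. The trailing 0-index GEP
  ; is redundant and left for a follow-up (see TODO in
  ; VPlanRecipes.cpp).
  %gep.0 = getelementptr float, ptr %base, i64 %iv
  %addr  = getelementptr float, ptr %gep.0, i32 0

In the non-dominating case, the new PtrAdd amounts to a negative byte
offset from the address at the insert position, e.g.
"getelementptr i8, ptr %insert.pos.addr, i32 -4" when the insert
position is member 2 of an interleave group of i16s (see the
interleaved-accesses-different-insert-position.ll update below).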
---
.../Transforms/Vectorize/LoopVectorize.cpp | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 7 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 +++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 50 ++++++++++++++++---
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
.../LoopVectorize/X86/interleave-cost.ll | 17 +++----
.../x86-interleaved-accesses-masked-group.ll | 18 +++----
...aved-accesses-different-insert-position.ll | 8 +--
.../LoopVectorize/interleaved-accesses.ll | 25 ++++------
.../LoopVectorize/vplan-printing.ll | 5 +-
10 files changed, 90 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb104c4ed2d03d..1e92bda5494248 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8993,8 +8993,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
- CM.isScalarEpilogueAllowed());
+ VPlanTransforms::createInterleaveGroups(
+ *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
for (ElementCount VF : Range)
Plan->addVF(VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e7ea5cb23b90d3..b08cba798a9a9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1330,6 +1330,13 @@ class VPInstruction : public VPRecipeWithIRFlags {
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}
+ VPInstruction(VPValue *Ptr, VPValue *Offset, bool InBounds, DebugLoc DL = {},
+ const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
+ ArrayRef<VPValue *>({Ptr, Offset}),
+ GEPFlagsTy(InBounds), DL),
+ Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
+
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f84317ba51257a..df4e3a54812626 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -646,7 +646,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
- return Builder.CreatePtrAdd(Ptr, Addend, Name);
+ return Builder.CreatePtrAdd(Ptr, Addend, Name,
+ isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none());
}
case VPInstruction::ResumePhi: {
if (Part != 0)
@@ -2393,7 +2395,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
- unsigned Index = Group->getIndex(Instr);
// TODO: extend the masked interleaved-group support to reversed access.
VPValue *BlockInMask = getMask();
@@ -2413,10 +2414,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
Idx = State.Builder.CreateMul(Idx,
State.Builder.getInt32(Group->getFactor()));
- Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
Idx = State.Builder.CreateNeg(Idx);
- } else
- Idx = State.Builder.getInt32(-Index);
+ } else {
+ // TODO: Drop redundant 0-index GEP as follow-up.
+ Idx = State.Builder.getInt32(0);
+ }
VPValue *Addr = getAddr();
for (unsigned Part = 0; Part < State.UF; Part++) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ee7c7cea0b7670..4dae6a54b017f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -596,8 +596,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
SE, nullptr, StartV, StepV, InsertPt);
- auto *Recipe = new VPInstruction(VPInstruction::PtrAdd,
- {PtrIV->getStartValue(), Steps},
+ auto *Recipe = new VPInstruction(PtrIV->getStartValue(), Steps, false,
PtrIV->getDebugLoc(), "next.gep");
Recipe->insertAfter(Steps);
@@ -1522,14 +1521,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
}
void VPlanTransforms::createInterleaveGroups(
- const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+ VPlan &Plan,
+ const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+ &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
+ if (InterleaveGroups.empty())
+ return;
+
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
for (const auto *IG : InterleaveGroups) {
- auto *Recipe =
- cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1539,9 +1543,39 @@ void VPlanTransforms::createInterleaveGroups(
bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
- auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
- Recipe->getMask(), NeedsMaskForGaps);
- VPIG->insertBefore(Recipe);
+
+ Instruction *IRInsertPos = IG->getInsertPos();
+ auto *InsertPos =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+ VPRecipeBase *IP = InsertPos;
+
+ // Get or create the start address for the interleave group.
+ auto *Start =
+ cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
+ VPValue *Addr = Start->getAddr();
+ if (!VPDT.properlyDominates(Addr->getDefiningRecipe(), InsertPos)) {
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ // We cannot re-use the address of the first member because it does not
+ // dominate the insert position. Use the address of the insert position
+ // and create a PtrAdd to adjust the index to start at the first member.
+ APInt Offset(32,
+ getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+ IG->getIndex(IRInsertPos),
+ /*IsSigned=*/true);
+ VPValue *OffsetVPV = Plan.getOrAddLiveIn(
+ ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+ Addr = new VPInstruction(InsertPos->getAddr(), OffsetVPV, InBounds);
+ Addr->getDefiningRecipe()->insertAfter(InsertPos);
+ IP = Addr->getDefiningRecipe();
+ }
+ auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
+ InsertPos->getMask(), NeedsMaskForGaps);
+ VPIG->insertAfter(IP);
+
unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e714f69eeff1ab..434545c5ea67e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -112,7 +112,9 @@ struct VPlanTransforms {
// widening its memory instructions with a single VPInterleaveRecipe at its
// insertion point.
static void createInterleaveGroups(
- const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+ VPlan &Plan,
+ const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+ &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
};
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index cc1d11754b27ed..9383799b181c82 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -89,12 +89,11 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP41]]
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 0
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -102,8 +101,8 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
+; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -266,8 +265,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]]
; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3
+; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -490,11 +488,10 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP50]]
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !alias.scope [[META6:![0-9]+]]
-; CHECK-NEXT: [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7
-; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7
+; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 0
; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 6b52023cfbcaec..968058134690bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1419,13 +1419,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC1]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -2555,13 +2553,11 @@ define dso_local void @masked_strided2_unknown_tc(ptr noalias nocapture readonly
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or disjoint i32 [[TMP2]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP5]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2989,13 +2985,11 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(ptr noalias noca
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -1
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 5028718dc49d15..8773350bdb4243 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -18,7 +18,8 @@ define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 -1
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i8>, ptr [[TMP41]], align 1
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC2]], [[STRIDED_VEC]]
@@ -92,8 +93,9 @@ define void @test_ig_insert_pos_at_end_of_vpbb(ptr noalias %dst, ptr noalias %sr
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { i16, i16, i16, i16 }, ptr [[SRC]], i64 [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 -2
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 -4
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP51]], align 2
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[STRIDED_VEC]], i32 3
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 29ce8457e8d65f..f14f38cf0e14b5 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -38,13 +38,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -115,9 +113,9 @@ define void @test_struct_array_load3_store3() {
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -270,10 +268,10 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -357,8 +355,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -28
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -24
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -669,15 +667,13 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1286,19 +1282,16 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP7]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP8]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP9]], align 4
; CHECK-NEXT: store i32 [[X]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index f18ed825a6b886..837dc6ccd7a935 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -348,10 +348,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: ir<%AB.0> = load from index 0
; CHECK-NEXT: ir<%AB.1> = load from index 1
; CHECK-NEXT: ir<%AB.3> = load from index 3
-; CHECK-NEXT: CLONE ir<%iv.plus.3> = add vp<[[STEPS]]>, ir<3>
; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%AB.0>, ir<%AB.1>
-; CHECK-NEXT: CLONE ir<%gep.CD.3> = getelementptr inbounds ir<@CD>, ir<0>, ir<%iv.plus.3>
-; CHECK-NEXT: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.3>
+; CHECK-NEXT: CLONE ir<%gep.CD.0> = getelementptr inbounds ir<@CD>, ir<0>, vp<[[STEPS]]>
+; CHECK-NEXT: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%gep.CD.0>
; CHECK-NEXT: store ir<%add> to index 0
; CHECK-NEXT: store ir<1> to index 1
; CHECK-NEXT: store ir<2> to index 2
>From 4568bc6c1751a2a66332b924c5faf378d45d6858 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 15 Aug 2024 13:38:58 +0100
Subject: [PATCH 2/3] [LV] Add
---
...sform-narrow-interleave-to-widen-memory.ll | 779 ++++++++
...sform-narrow-interleave-to-widen-memory.ll | 1677 +++++++++++++++++
2 files changed, 2456 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 00000000000000..d6cd44d0c4f0ca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,779 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s
+; https://github.com/llvm/llvm-project/issues/82936
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_4xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
+; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %data.3 = getelementptr inbounds { i64 , i64, i64, i64 }, ptr %data, i64 %iv, i32 3
+ %l.3 = load i64, ptr %data.3, align 8
+ %mul.3 = mul i64 %l.factor, %l.3
+ store i64 %mul.3, ptr %data.3, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.0 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.0, align 8
+ %mul.1 = mul i64 %l.factor, %l.0
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_1(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.0, align 8
+ store i64 %mul.0, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_store_order_flipped_2(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.1, align 8
+ store i64 %mul.1, ptr %data.0, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
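+; The muls feeding the interleaved stores are themselves fed by loads from different pointers (%src.0, %src.1 and %data).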
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV1]]
+; CHECK-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+ %l.src.0 = load i64, ptr %gep.src.0, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %gep.src.0, align 8
+ %mul.0 = mul i64 %l.src.0, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+ %l.src.1 = load i64, ptr %gep.src.1, align 8
+ %mul.1 = mul i64 %l.src.1, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
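+; Interleave group with 3 i64 members, accessed via a { i64, i64, i64 } struct; each member is multiplied by the shared factor and stored back.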
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi64(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
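+; Only members 0 and 1 of a { i32, i32, i32 } struct are accessed; the loop is not vectorized.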
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_2xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
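+; i32 variant of @test_3xi64: all 3 i32 members are multiplied by the shared factor and stored back.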
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; CHECK-LABEL: define void @test_3xi32(
+; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 8, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <24 x i32>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 0
+; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i32, ptr %data.2, align 8
+ %mul.2 = mul i32 %l.factor, %l.2
+ store i32 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 00000000000000..accd340a0d8ea8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,1677 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefixes=VF4 %s
+; https://github.com/llvm/llvm-project/issues/82936
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
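+; Base case: 2 interleaved i64 members of %data are each multiplied by the same factor and stored back in place.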
+define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP15:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP15]], i32 0
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP15]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF2-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0
+; VF2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP19]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP25:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
+; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP30]]
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP32]]
+; VF4-NEXT: [[TMP34:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP35:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP36:%.*]] = load i64, ptr [[TMP31]], align 8
+; VF4-NEXT: [[TMP37:%.*]] = load i64, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = insertelement <4 x i64> poison, i64 [[TMP34]], i32 0
+; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1
+; VF4-NEXT: [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3
+; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP41]]
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP31]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
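+; Same as @test_2xi64, but both member loads happen before the muls and each mul uses the other member's loaded value.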
+define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]]
+; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP17]], i32 0
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[TMP18]], i32 1
+; VF2-NEXT: [[TMP21:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP20]]
+; VF2-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0
+; VF2-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP21]], i32 1
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP24]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP24]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP16]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP21]]
+; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP23]]
+; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP25]]
+; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP28]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> poison, i64 [[TMP29]], i32 0
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 2
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 3
+; VF4-NEXT: [[TMP37:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP36]]
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP37]], i32 0
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP37]], i32 1
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP37]], i32 2
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP37]], i32 3
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP42]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP22]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP42]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP42]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP42]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP28]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.0 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.0, align 8
+ %mul.1 = mul i64 %l.factor, %l.0
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
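+; Same as @test_2xi64, but %mul.1 is stored to %data.0 and %mul.0 to %data.1.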
+define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_1(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.0, align 8
+ store i64 %mul.0, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
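+; Same as @test_2xi64_store_order_flipped_1, but the program order of the two
+; stores is also flipped: %mul.0 is stored to member 1 before %mul.1 is stored
+; to member 0.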
+define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT: [[TMP13:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP16]]
+; VF2-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP19]], i32 1
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP21]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP25]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP26]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[TMP28:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: [[TMP29:%.*]] = or disjoint i64 [[TMP28]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP29]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_store_order_flipped_2(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; VF4-NEXT: [[TMP20:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP19]]
+; VF4-NEXT: [[TMP21:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP28]]
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = load i64, ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i32 0
+; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i64> [[TMP34]], i64 [[TMP31]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP35]], i64 [[TMP32]], i32 2
+; VF4-NEXT: [[TMP37:%.*]] = insertelement <4 x i64> [[TMP36]], i64 [[TMP33]], i32 3
+; VF4-NEXT: [[TMP38:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP37]]
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
+; VF4-NEXT: store i64 [[TMP41]], ptr [[TMP27]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; VF4-NEXT: store i64 [[TMP42]], ptr [[TMP29]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP38]], i32 0
+; VF4-NEXT: store i64 [[TMP43]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP38]], i32 1
+; VF4-NEXT: store i64 [[TMP44]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP38]], i32 2
+; VF4-NEXT: store i64 [[TMP45]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP38]], i32 3
+; VF4-NEXT: store i64 [[TMP46]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP48]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: [[TMP49:%.*]] = or disjoint i64 [[TMP48]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP49]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.0, ptr %data.1, align 8
+ store i64 %mul.1, ptr %data.0, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
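+; The multiplies of the two interleave-group members are fed by different
+; loads: member 0 is multiplied with a re-load through %gep.src.0, member 1
+; with a load from %src.1.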
+define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noalias %src.0, ptr noalias %src.1, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0
+; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
+; VF2-NEXT: store i64 [[TMP10]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 1>
+; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 8
+; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
+; VF2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[TMP17]], i32 1
+; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD2]], [[TMP19]]
+; VF2-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
+; VF2-NEXT: store i64 [[TMP23]], ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
+; VF2-NEXT: store i64 [[TMP24]], ptr [[TMP15]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; VF2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF2-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = shl nsw i64 [[IV]], 1
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP26]]
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[TMP27:%.*]] = or disjoint i64 [[TMP26]], 1
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF2-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi64_different_loads_feeding_fmul(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[SRC_0:%.*]], ptr noalias [[SRC_1:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
+; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0
+; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1
+; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; VF4-NEXT: store i64 [[TMP16]], ptr [[TMP11]], align 8
+; VF4-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; VF4-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]]
+; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP22]]
+; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
+; VF4-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP21]], align 8
+; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x i64> poison, i64 [[TMP26]], i32 0
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP27]], i32 1
+; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 2
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 3
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
+; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP36:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[TMP33]]
+; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP36]], i32 0
+; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP36]], i32 1
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP21]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP36]], i32 2
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP36]], i32 3
+; VF4-NEXT: store i64 [[TMP40]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; VF4-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
+; VF4-NEXT: [[L_SRC_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT: [[TMP42:%.*]] = shl nsw i64 [[IV]], 1
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP42]]
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[GEP_SRC_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_SRC_0]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP42]], 1
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP43]]
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
+; VF4-NEXT: [[L_SRC_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_SRC_1]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src.0 = getelementptr inbounds i64, ptr %src.0, i64 %iv
+ %l.src.0 = load i64, ptr %gep.src.0, align 8
+ %1 = shl nsw i64 %iv, 1
+ %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+ %l.0 = load i64, ptr %gep.src.0, align 8
+ %mul.0 = mul i64 %l.src.0, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %3 = or disjoint i64 %1, 1
+ %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+ %l.1 = load i64, ptr %data.1, align 8
+ %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i64 %iv
+ %l.src.1 = load i64, ptr %gep.src.1, align 8
+ %mul.1 = mul i64 %l.src.1, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
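+; Interleave group with 3 i64 members of a { i64, i64, i64 } struct; each
+; member is multiplied by the same loaded factor and stored back.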
+define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi64(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
+; VF2-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP9]]
+; VF2-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
+; VF2-NEXT: store i64 [[TMP11]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
+; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0
+; VF2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[TMP16]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP18]]
+; VF2-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP19]], i32 0
+; VF2-NEXT: store i64 [[TMP20]], ptr [[TMP13]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP19]], i32 1
+; VF2-NEXT: store i64 [[TMP21]], ptr [[TMP14]], align 8
+; VF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP24]], i32 0
+; VF2-NEXT: [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1
+; VF2-NEXT: [[TMP28:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP27]]
+; VF2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i32 0
+; VF2-NEXT: store i64 [[TMP29]], ptr [[TMP22]], align 8
+; VF2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i32 1
+; VF2-NEXT: store i64 [[TMP30]], ptr [[TMP23]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF2-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_3xi64(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1
+; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2
+; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3
+; VF4-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP17]]
+; VF4-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; VF4-NEXT: store i64 [[TMP19]], ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; VF4-NEXT: store i64 [[TMP20]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; VF4-NEXT: store i64 [[TMP21]], ptr [[TMP8]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP9]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP27:%.*]] = load i64, ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = load i64, ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP27]], i32 0
+; VF4-NEXT: [[TMP32:%.*]] = insertelement <4 x i64> [[TMP31]], i64 [[TMP28]], i32 1
+; VF4-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP29]], i32 2
+; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x i64> [[TMP33]], i64 [[TMP30]], i32 3
+; VF4-NEXT: [[TMP35:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP34]]
+; VF4-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP35]], i32 0
+; VF4-NEXT: store i64 [[TMP36]], ptr [[TMP23]], align 8
+; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP35]], i32 1
+; VF4-NEXT: store i64 [[TMP37]], ptr [[TMP24]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x i64> [[TMP35]], i32 2
+; VF4-NEXT: store i64 [[TMP38]], ptr [[TMP25]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP35]], i32 3
+; VF4-NEXT: store i64 [[TMP39]], ptr [[TMP26]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT: [[TMP41:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT: [[TMP42:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = load i64, ptr [[TMP40]], align 8
+; VF4-NEXT: [[TMP45:%.*]] = load i64, ptr [[TMP41]], align 8
+; VF4-NEXT: [[TMP46:%.*]] = load i64, ptr [[TMP42]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = load i64, ptr [[TMP43]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = insertelement <4 x i64> poison, i64 [[TMP44]], i32 0
+; VF4-NEXT: [[TMP49:%.*]] = insertelement <4 x i64> [[TMP48]], i64 [[TMP45]], i32 1
+; VF4-NEXT: [[TMP50:%.*]] = insertelement <4 x i64> [[TMP49]], i64 [[TMP46]], i32 2
+; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i64> [[TMP50]], i64 [[TMP47]], i32 3
+; VF4-NEXT: [[TMP52:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[TMP51]]
+; VF4-NEXT: [[TMP53:%.*]] = extractelement <4 x i64> [[TMP52]], i32 0
+; VF4-NEXT: store i64 [[TMP53]], ptr [[TMP40]], align 8
+; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x i64> [[TMP52]], i32 1
+; VF4-NEXT: store i64 [[TMP54]], ptr [[TMP41]], align 8
+; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i64> [[TMP52]], i32 2
+; VF4-NEXT: store i64 [[TMP55]], ptr [[TMP42]], align 8
+; VF4-NEXT: [[TMP56:%.*]] = extractelement <4 x i64> [[TMP52]], i32 3
+; VF4-NEXT: store i64 [[TMP56]], ptr [[TMP43]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP57]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
+; VF4-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i64, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i64, ptr %data.0, align 8
+ %mul.0 = mul i64 %l.factor, %l.0
+ store i64 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i64, ptr %data.1, align 8
+ %mul.1 = mul i64 %l.factor, %l.1
+ store i64 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i64, i64, i64 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i64, ptr %data.2, align 8
+ %mul.2 = mul i64 %l.factor, %l.2
+ store i64 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
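+; Only members 0 and 1 of a { i32, i32, i32 } struct are accessed; the factor
+; is loaded as i32 through an i64-typed GEP.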
+define void @test_2xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_2xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_2xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP50]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+ %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+ store i32 %mul.1, ptr %data.1, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
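+
+; All 3 i32 members of a { i32, i32, i32 } struct are accessed; as above, the
+; factor is loaded as i32 through an i64-typed GEP.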
+define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) {
+; VF2-LABEL: define void @test_3xi32(
+; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF2-NEXT: [[ENTRY:.*]]:
+; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF2: [[VECTOR_PH]]:
+; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2: [[VECTOR_BODY]]:
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP11]], i32 1
+; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
+; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8
+; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
+; VF2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i32 1
+; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
+; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8
+; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8
+; VF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP26]], align 8
+; VF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 8
+; VF2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0
+; VF2-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF2-NEXT: [[TMP32:%.*]] = mul <2 x i32> [[TMP7]], [[TMP31]]
+; VF2-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; VF2-NEXT: store i32 [[TMP33]], ptr [[TMP26]], align 8
+; VF2-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1
+; VF2-NEXT: store i32 [[TMP34]], ptr [[TMP27]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF2-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2: [[MIDDLE_BLOCK]]:
+; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2: [[SCALAR_PH]]:
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: br label %[[LOOP:.*]]
+; VF2: [[LOOP]]:
+; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF2-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF2-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF2-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF2-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF2-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF2-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF2-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF2-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF2-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF2-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF2: [[EXIT]]:
+; VF2-NEXT: ret void
+;
+; VF4-LABEL: define void @test_3xi32(
+; VF4-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) {
+; VF4-NEXT: [[ENTRY:.*]]:
+; VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4: [[VECTOR_PH]]:
+; VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4: [[VECTOR_BODY]]:
+; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; VF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; VF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; VF4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 0
+; VF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 0
+; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 0
+; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 0
+; VF4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; VF4-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP21]], i32 1
+; VF4-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i32 2
+; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i32 3
+; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
+; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
+; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
+; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
+; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8
+; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8
+; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
+; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
+; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 1
+; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
+; VF4-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; VF4-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
+; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8
+; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8
+; VF4-NEXT: [[TMP50:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 2
+; VF4-NEXT: [[TMP51:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 2
+; VF4-NEXT: [[TMP52:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP2]], i32 2
+; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 2
+; VF4-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP50]], align 8
+; VF4-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP51]], align 8
+; VF4-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP52]], align 8
+; VF4-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP53]], align 8
+; VF4-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i32 0
+; VF4-NEXT: [[TMP59:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP55]], i32 1
+; VF4-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP56]], i32 2
+; VF4-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP57]], i32 3
+; VF4-NEXT: [[TMP62:%.*]] = mul <4 x i32> [[TMP15]], [[TMP61]]
+; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i32> [[TMP62]], i32 0
+; VF4-NEXT: store i32 [[TMP63]], ptr [[TMP50]], align 8
+; VF4-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP62]], i32 1
+; VF4-NEXT: store i32 [[TMP64]], ptr [[TMP51]], align 8
+; VF4-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[TMP62]], i32 2
+; VF4-NEXT: store i32 [[TMP65]], ptr [[TMP52]], align 8
+; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP62]], i32 3
+; VF4-NEXT: store i32 [[TMP66]], ptr [[TMP53]], align 8
+; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4: [[MIDDLE_BLOCK]]:
+; VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4: [[SCALAR_PH]]:
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: br label %[[LOOP:.*]]
+; VF4: [[LOOP]]:
+; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; VF4-NEXT: [[L_FACTOR:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 0
+; VF4-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8
+; VF4-NEXT: [[MUL_0:%.*]] = mul i32 [[L_FACTOR]], [[L_0]]
+; VF4-NEXT: store i32 [[MUL_0]], ptr [[DATA_0]], align 8
+; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 1
+; VF4-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8
+; VF4-NEXT: [[MUL_1:%.*]] = mul i32 [[L_FACTOR]], [[L_1]]
+; VF4-NEXT: store i32 [[MUL_1]], ptr [[DATA_1]], align 8
+; VF4-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
+; VF4-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8
+; VF4-NEXT: [[MUL_2:%.*]] = mul i32 [[L_FACTOR]], [[L_2]]
+; VF4-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8
+; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF4: [[EXIT]]:
+; VF4-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i32, ptr %arrayidx, align 8
+ %data.0 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 0
+ %l.0 = load i32, ptr %data.0, align 8
+ %mul.0 = mul i32 %l.factor, %l.0
+ store i32 %mul.0, ptr %data.0, align 8
+ %data.1 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 1
+ %l.1 = load i32, ptr %data.1, align 8
+ %mul.1 = mul i32 %l.factor, %l.1
+  store i32 %mul.1, ptr %data.1, align 8
+ %data.2 = getelementptr inbounds { i32, i32, i32 }, ptr %data, i64 %iv, i32 2
+ %l.2 = load i32, ptr %data.2, align 8
+ %mul.2 = mul i32 %l.factor, %l.2
+ store i32 %mul.2, ptr %data.2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF2: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF2: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF2: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF2: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF2: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF2: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF2: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF2: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
+; VF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; VF4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; VF4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; VF4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; VF4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; VF4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; VF4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; VF4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; VF4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
>From 6a2778cb58f1ca89e322a8c37978b973b60d2751 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Jul 2024 11:24:55 +0100
Subject: [PATCH 3/3] [VPlan] Add transformation to narrow interleave groups.
This patch adds a new narrowInterleaveGroups transform, which tries to
convert a plan containing interleave groups with VF elements into a plan
that instead replaces the interleave groups with wide loads and stores
processing VF elements.
This is effectively a very simple form of loop-aware SLP, where we use
interleave groups to identify candidates.
This initial version is quite restricted and hopefully serves as a
starting point for how best to model these kinds of transforms.
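To make the effect concrete, here is a condensed before/after sketch of the
vector loop for a gap-free factor-4 interleave group vectorized with VF=4
(abbreviated from the test_4xi64 changes in this patch; value names are
illustrative):
  ; Before: 16 elements are loaded, de-interleaved, multiplied by the
  ; factor, re-interleaved and stored per vector iteration (index += 4).
  %wide.vec = load <16 x i64>, ptr %gep.data, align 8
  %member.0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  ; ... three more strided shuffles, four muls and the re-interleaving ...
  store <16 x i64> %interleaved.vec, ptr %gep.data, align 8
  ; After narrowing: one wide load and store of VF elements; each vector
  ; iteration now covers a single original iteration (index += 1).
  %wide.load = load <4 x i64>, ptr %gep.data, align 8
  %mul = mul <4 x i64> %factor.splat, %wide.load
  store <4 x i64> %mul, ptr %gep.data, align 8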
Depends on https://github.com/llvm/llvm-project/pull/106431.
Fixes https://github.com/llvm/llvm-project/issues/82936
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 128 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 2 +
...sform-narrow-interleave-to-widen-memory.ll | 74 ++++------
5 files changed, 162 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1e92bda5494248..d1f9783a87afef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7487,6 +7487,9 @@ LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF))
+    LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index df4e3a54812626..b837e8a7a350f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -47,6 +47,9 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
bool VPRecipeBase::mayWriteToMemory() const {
switch (getVPDefID()) {
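+  // Conservatively treat VPInstructions with non-binary opcodes as possibly
+  // writing to memory; binary opcodes never write.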
+  case VPInstructionSC:
+    return !Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode());
case VPInterleaveSC:
return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
@@ -62,6 +65,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBranchOnMaskSC:
case VPScalarIVStepsSC:
case VPPredInstPHISC:
+ case VPVectorPointerSC:
return false;
case VPBlendSC:
case VPReductionEVLSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4dae6a54b017f6..48bca286f0c485 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -672,6 +672,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
VPBasicBlock *ExitingVPBB =
@@ -713,6 +714,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// TODO: Further simplifications are possible
// 1. Replace inductions with constants.
// 2. Replace vector loop region with VPBasicBlock.
}
/// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1589,3 +1591,129 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
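+/// Returns true if \p V is a supported operand for narrowing: \p R0 is the
+/// widened binary operation whose results are stored, and \p V must either be
+/// an unmasked wide load used directly by \p R0 or member \p Idx of a
+/// gap-free load interleave group.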
+static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+ if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
+ if (W->getMask())
+ return false;
+    return R0->getOperand(0) == V || R0->getOperand(1) == V;
+ }
+
+ if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe())) {
+ return IR->getInterleaveGroup()->getFactor() ==
+ IR->getInterleaveGroup()->getNumMembers() &&
+ IR->getVPValue(Idx) == V;
+ }
+ return false;
+}
+
+/// Returns true if \p IR is a consecutive interleave group with \p VF members.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
+ ElementCount VF) {
+ if (!IR)
+ return false;
+ auto IG = IR->getInterleaveGroup();
+ return IG->getFactor() == IG->getNumMembers() &&
+ IG->getNumMembers() == VF.getKnownMinValue();
+}
+
+bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+ using namespace llvm::VPlanPatternMatch;
+ if (VF.isScalable())
+ return false;
+
+ bool Changed = false;
+ SmallVector<VPInterleaveRecipe *> StoreGroups;
+ for (auto &R : make_early_inc_range(
+ *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+ if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+ isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+
+    // Bail out on recipes not supported at the moment:
+    // * phi recipes other than the canonical induction
+    // * recipes writing to memory, except interleave groups
+ if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
+ (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
+ return false;
+
+ auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+ if (!IR)
+ continue;
+
+ if (!isConsecutiveInterleaveGroup(IR, VF))
+ return false;
+ if (IR->getStoredValues().empty())
+ continue;
+
+ auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
+ IR->getStoredValues()[0]->getDefiningRecipe());
+ if (!Lane0)
+ return false;
+ for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
+ auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+ if (!R || R->getOpcode() != Lane0->getOpcode())
+ return false;
+ // Work around captured structured bindings being a C++20 extension.
+ auto Idx = I;
+ if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+ return !supportedLoad(Lane0, V, Idx);
+ }))
+ return false;
+ }
+
+ StoreGroups.push_back(IR);
+ }
+
+  // Narrow the operation trees rooted at the store groups.
+ for (auto *StoreGroup : StoreGroups) {
+ auto *Lane0 = cast<VPWidenRecipe>(
+ StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
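+    // The stored value is a binary operation with one operand defined by a
+    // load interleave group and the other by a wide load; the operand order
+    // may be flipped.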
+ unsigned LoadGroupIdx =
+ isa<VPInterleaveRecipe>(Lane0->getOperand(1)->getDefiningRecipe()) ? 1
+ : 0;
+ unsigned WideLoadIdx = 1 - LoadGroupIdx;
+ auto *LoadGroup = cast<VPInterleaveRecipe>(
+ Lane0->getOperand(LoadGroupIdx)->getDefiningRecipe());
+
+ auto *WideLoad = cast<VPWidenLoadRecipe>(
+ Lane0->getOperand(WideLoadIdx)->getDefiningRecipe());
+
+    // Narrow the wide load to a uniform scalar load, as the transformed VPlan
+    // will only process one original iteration.
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+ WideLoad->operands(), true);
+    // Narrow the interleave group to a wide load, as the transformed VPlan
+    // will only process one original iteration.
+ auto *L = new VPWidenLoadRecipe(
+ *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+ LoadGroup->getAddr(), LoadGroup->getMask(), true, false,
+ LoadGroup->getDebugLoc());
+ L->insertBefore(LoadGroup);
+ N->insertBefore(LoadGroup);
+ Lane0->setOperand(LoadGroupIdx, L);
+ Lane0->setOperand(WideLoadIdx, N);
+
+ auto *S = new VPWidenStoreRecipe(
+ *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+ StoreGroup->getAddr(), Lane0, nullptr, true, false,
+ StoreGroup->getDebugLoc());
+ S->insertBefore(StoreGroup);
+ StoreGroup->eraseFromParent();
+ Changed = true;
+ }
+
+ if (!Changed)
+ return false;
+
+ // Adjust induction to reflect that the transformed plan only processes one
+ // original iteration.
+ auto *CanIV = Plan.getCanonicalIV();
+ VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+ Inc->setOperand(
+ 1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ removeDeadRecipes(Plan);
+ return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 434545c5ea67e1..3ca37a71cb8347 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -116,6 +116,8 @@ struct VPlanTransforms {
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
+
+ static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index d6cd44d0c4f0ca..ffe66ff0007fbb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,28 +20,16 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -50,23 +38,23 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
-; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_2]], [[L_0]]
; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_2]], [[L_1]]
; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
-; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
-; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[DATA_4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_4:%.*]] = load i64, ptr [[DATA_4]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_2]], [[L_4]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_4]], align 8
; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
-; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_2]], [[L_3]]
; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -129,13 +117,10 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,14 +197,11 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -297,13 +279,10 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -458,6 +437,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
@@ -469,7 +449,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
@@ -554,14 +534,12 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -693,14 +671,12 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]