[llvm] [VPlan] Unroll by VF with Pack/Unpack. (PR #145188)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 21 14:22:40 PDT 2025


https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/145188

None

>From 51b88fa31bebe2f6efb2e36897daaa30d785f00d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 21 Jun 2025 14:22:08 +0100
Subject: [PATCH 1/4] [VPlan] Explicit pack

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  14 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  18 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  22 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  29 +
 .../epilog-vectorization-widen-inductions.ll  |   4 +-
 .../AArch64/masked-call-scalarize.ll          |   4 +-
 .../partial-reduce-dot-product-neon.ll        |  32 +-
 .../AArch64/partial-reduce-dot-product.ll     |  32 +-
 .../AArch64/struct-return-cost.ll             |  55 +-
 .../AArch64/synthesize-mask-for-call.ll       |   3 +-
 .../RISCV/riscv-vector-reverse-output.ll      |   8 +-
 .../LoopVectorize/X86/cost-model.ll           |  52 +-
 .../X86/fixed-order-recurrence.ll             |   8 +-
 .../X86/interleave-opaque-pointers.ll         |   3 +-
 .../LoopVectorize/X86/load-deref-pred.ll      | 568 +++++++++---------
 .../LoopVectorize/X86/strided_load_cost.ll    |  96 +--
 .../LoopVectorize/first-order-recurrence.ll   |  16 +-
 .../Transforms/LoopVectorize/induction.ll     |  24 +-
 .../vplan-widen-struct-return.ll              |   5 +-
 20 files changed, 526 insertions(+), 471 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a3f39f5ad7a29..c0675c7003298 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -360,17 +360,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
   // resulting vectors are stored in State, we will only generate the
   // insertelements once.
   Value *VectorValue = nullptr;
-  if (IsSingleScalar) {
-    VectorValue = GetBroadcastInstrs(ScalarValue);
-    set(Def, VectorValue);
-  } else {
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    // Initialize packing with insertelements to start from poison.
-    VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
-    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
-      VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
-    set(Def, VectorValue);
-  }
+  assert(IsSingleScalar && "replicates must be packed explicitly");
+  VectorValue = GetBroadcastInstrs(ScalarValue);
+  set(Def, VectorValue);
   Builder.restoreIP(OldIP);
   return VectorValue;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4163b0743a9a..1570fbc4eac73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -970,6 +970,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // Creates a step vector starting from 0 to VF with a step of 1.
     StepVector,
 
+    Pack,
+
   };
 
 private:
@@ -979,14 +981,6 @@ class VPInstruction : public VPRecipeWithIRFlags,
   /// An optional name that can be used for the generated IR instruction.
   const std::string Name;
 
-  /// Returns true if this VPInstruction generates scalar values for all lanes.
-  /// Most VPInstructions generate a single value per part, either vector or
-  /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
-  /// values per all lanes, stemming from an original ingredient. This method
-  /// identifies the (rare) cases of VPInstructions that do so as well, w/o an
-  /// underlying ingredient.
-  bool doesGeneratePerAllLanes() const;
-
   /// Returns true if we can generate a scalar for the first lane only if
   /// needed.
   bool canGenerateScalarForFirstLane() const;
@@ -1080,6 +1074,14 @@ class VPInstruction : public VPRecipeWithIRFlags,
   /// result is also a single scalar.
   bool isSingleScalar() const;
 
+  /// Returns true if this VPInstruction generates scalar values for all lanes.
+  /// Most VPInstructions generate a single value per part, either vector or
+  /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
+  /// values per all lanes, stemming from an original ingredient. This method
+  /// identifies the (rare) cases of VPInstructions that do so as well, w/o an
+  /// underlying ingredient.
+  bool doesGeneratePerAllLanes() const;
+
   /// Returns the symbolic name assigned to the VPInstruction.
   StringRef getName() const { return Name; }
 };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..9004a09be5726 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -100,6 +100,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::ExplicitVectorLength:
     return Type::getIntNTy(Ctx, 32);
   case Instruction::PHI:
+  case VPInstruction::Pack:
     // Infer the type of first operand only, as other operands of header phi's
     // may lead to infinite recursion.
     return inferScalarType(R->getOperand(0));
@@ -440,6 +441,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     if (!VPBB->getParent())
       break;
     for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPInstruction>(&R) &&
+          cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Pack)
+        continue;
       Idx2Recipe.push_back(&R);
 
       // Save the end location of each USE.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3e12fdf9163eb..c9c78e00de3f8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -766,6 +766,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
                                                 true, Name);
   }
+  case VPInstruction::Pack: {
+    assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+    Value *WideValue = PoisonValue::get(
+        toVectorizedTy(State.TypeAnalysis.inferScalarType(this), State.VF));
+    for (unsigned Lane = 0; Lane < State.VF.getFixedValue(); ++Lane)
+      WideValue =
+          State.packScalarIntoVectorizedValue(getOperand(0), WideValue, Lane);
+    return WideValue;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -894,10 +903,11 @@ void VPInstruction::execute(VPTransformState &State) {
   if (!hasResult())
     return;
   assert(GeneratedValue && "generate must produce a value");
-  assert(
-      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
-       State.VF.isScalar()) &&
-      "scalar value but not only first lane defined");
+  assert(((GeneratedValue->getType()->isVectorTy() ||
+           GeneratedValue->getType()->isStructTy()) ==
+              !GeneratesPerFirstLaneOnly ||
+          State.VF.isScalar()) &&
+         "scalar value but not only first lane defined");
   State.set(this, GeneratedValue,
             /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
@@ -923,6 +933,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::WideIVStep:
   case VPInstruction::StepVector:
   case VPInstruction::ReductionStartVector:
+  case VPInstruction::Pack:
     return false;
   default:
     return true;
@@ -1063,6 +1074,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ReductionStartVector:
     O << "reduction-start-vector";
     break;
+  case VPInstruction::Pack:
+    O << "pack-into-vector";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d66733cac4d66..4b28be528d3ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1895,6 +1895,34 @@ static void removeBranchOnConst(VPlan &Plan) {
   }
 }
 
+static void materializePack(VPlan &Plan) {
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!(RepR && !RepR->isSingleScalar()) &&
+          !(isa<VPInstruction>(&R) &&
+            cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
+        continue;
+      auto *Def = cast<VPSingleDefRecipe>(&R);
+      if (all_of(Def->users(),
+                 [Def](VPUser *U) { return U->usesScalars(Def); }))
+        continue;
+
+      auto *Pack = new VPInstruction(VPInstruction::Pack, {Def});
+      Pack->insertAfter(Def);
+      Def->replaceUsesWithIf(Pack, [Pack, Def](VPUser &U, unsigned) {
+        return &U != Pack && !U.usesScalars(Def) &&
+               (!isa<VPInstruction>(&U) ||
+                (cast<VPInstruction>(&U)->getOpcode() !=
+                     VPInstruction::ExtractLastElement &&
+                 cast<VPInstruction>(&U)->getOpcode() !=
+                     VPInstruction::ExtractPenultimateElement));
+      });
+    }
+  }
+}
+
 void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(removeRedundantCanonicalIVs, Plan);
   runPass(removeRedundantInductionCasts, Plan);
@@ -1912,6 +1940,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(createAndOptimizeReplicateRegions, Plan);
   runPass(mergeBlocksIntoPredecessors, Plan);
   runPass(licm, Plan);
+  runPass(materializePack, Plan);
 }
 
 // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 83b35309638a7..0dd806f1350e2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -20,10 +20,10 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
 ; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index 078f98f54525b..1ab39f567e710 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -109,10 +109,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
-; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
 ; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
+; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
 ; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
 ; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 0fc324f720e60..b621ffdbae791 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -306,22 +306,6 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
-; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
-; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
-; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
-; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
-; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
-; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
-; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
-; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
-; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
-; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
@@ -338,6 +322,22 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a44b9bfb3b446..0b53894742722 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -655,22 +655,6 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
-; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
-; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
-; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
-; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
-; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
-; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
-; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
-; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
-; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
-; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
@@ -687,6 +671,22 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
index c721493243734..49b4080da3434 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
@@ -16,12 +16,14 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
 ; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 
 define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_widen(
@@ -65,16 +67,19 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
 ; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 
 define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_replicate(
@@ -119,28 +124,34 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
 ; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
 ; CHECK-COST: Cost of 10 for VF vscale x 8: WIDEN-CALL ir<%call> = call  @foo(ir<%in_val>, ir<true>) (using library function: scalable_vec_masked_foo)
 ; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 46b4762520ded..7d7fb8088c6fe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -29,9 +29,10 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN ir<%load> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:     REPLICATE ir<%call> = call @foo(ir<%load>)
+; CHECK-NEXT:     EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR2]]>, vp<[[PACK]]>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT:   No successors
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 09b274de30214..758101d3e886e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -593,14 +593,14 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1
 ; RV64-UF2-NEXT:    [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
 ; RV64-UF2-NEXT:    [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
-; RV64-UF2-NEXT:    [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
-; RV64-UF2-NEXT:    [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1
-; RV64-UF2-NEXT:    [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
-; RV64-UF2-NEXT:    [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
 ; RV64-UF2-NEXT:    [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
 ; RV64-UF2-NEXT:    [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
 ; RV64-UF2-NEXT:    [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
 ; RV64-UF2-NEXT:    [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
+; RV64-UF2-NEXT:    [[TMP60:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
+; RV64-UF2-NEXT:    [[TMP61:%.*]] = insertelement <4 x i7> [[TMP60]], i7 [[TMP6]], i32 1
+; RV64-UF2-NEXT:    [[TMP62:%.*]] = insertelement <4 x i7> [[TMP61]], i7 [[TMP26]], i32 2
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = insertelement <4 x i7> [[TMP62]], i7 [[TMP27]], i32 3
 ; RV64-UF2-NEXT:    [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
 ; RV64-UF2-NEXT:    [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
 ; RV64-UF2-NEXT:    [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2c6fe4f5c808e..9e15a46828350 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -126,30 +126,30 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
 ; CHECK-NEXT:    [[TMP36:%.*]] = load float, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[TMP38:%.*]] = load float, ptr [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3
 ; CHECK-NEXT:    [[TMP43:%.*]] = load float, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP44:%.*]] = load float, ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP46:%.*]] = load float, ptr [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1
-; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2
-; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3
 ; CHECK-NEXT:    [[TMP51:%.*]] = load float, ptr [[TMP27]], align 4
 ; CHECK-NEXT:    [[TMP52:%.*]] = load float, ptr [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP53:%.*]] = load float, ptr [[TMP29]], align 4
 ; CHECK-NEXT:    [[TMP54:%.*]] = load float, ptr [[TMP30]], align 4
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0
-; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1
-; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3
 ; CHECK-NEXT:    [[TMP59:%.*]] = load float, ptr [[TMP31]], align 4
 ; CHECK-NEXT:    [[TMP60:%.*]] = load float, ptr [[TMP32]], align 4
 ; CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[TMP33]], align 4
 ; CHECK-NEXT:    [[TMP62:%.*]] = load float, ptr [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x float> poison, float [[TMP35]], i32 0
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP36]], i32 1
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 2
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 3
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> poison, float [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP44]], i32 1
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i32 2
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP46]], i32 3
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x float> poison, float [[TMP51]], i32 0
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x float> [[TMP55]], float [[TMP52]], i32 1
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x float> [[TMP56]], float [[TMP53]], i32 2
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <4 x float> [[TMP57]], float [[TMP54]], i32 3
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x float> poison, float [[TMP59]], i32 0
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 1
 ; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 2
@@ -174,30 +174,30 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
 ; CHECK-NEXT:    [[TMP84:%.*]] = load float, ptr [[TMP68]], align 4
 ; CHECK-NEXT:    [[TMP85:%.*]] = load float, ptr [[TMP69]], align 4
 ; CHECK-NEXT:    [[TMP86:%.*]] = load float, ptr [[TMP70]], align 4
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0
-; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1
-; CHECK-NEXT:    [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2
-; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3
 ; CHECK-NEXT:    [[TMP91:%.*]] = load float, ptr [[TMP71]], align 4
 ; CHECK-NEXT:    [[TMP92:%.*]] = load float, ptr [[TMP72]], align 4
 ; CHECK-NEXT:    [[TMP93:%.*]] = load float, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP94:%.*]] = load float, ptr [[TMP74]], align 4
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0
-; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2
-; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float [[TMP94]], i32 3
 ; CHECK-NEXT:    [[TMP99:%.*]] = load float, ptr [[TMP75]], align 4
 ; CHECK-NEXT:    [[TMP100:%.*]] = load float, ptr [[TMP76]], align 4
 ; CHECK-NEXT:    [[TMP101:%.*]] = load float, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP102:%.*]] = load float, ptr [[TMP78]], align 4
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0
-; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1
-; CHECK-NEXT:    [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2
-; CHECK-NEXT:    [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3
 ; CHECK-NEXT:    [[TMP107:%.*]] = load float, ptr [[TMP79]], align 4
 ; CHECK-NEXT:    [[TMP108:%.*]] = load float, ptr [[TMP80]], align 4
 ; CHECK-NEXT:    [[TMP109:%.*]] = load float, ptr [[TMP81]], align 4
 ; CHECK-NEXT:    [[TMP110:%.*]] = load float, ptr [[TMP82]], align 4
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x float> poison, float [[TMP83]], i32 0
+; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 1
+; CHECK-NEXT:    [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 2
+; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x float> [[TMP89]], float [[TMP86]], i32 3
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x float> poison, float [[TMP91]], i32 0
+; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 1
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 2
+; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <4 x float> [[TMP97]], float [[TMP94]], i32 3
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x float> poison, float [[TMP99]], i32 0
+; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x float> [[TMP103]], float [[TMP100]], i32 1
+; CHECK-NEXT:    [[TMP105:%.*]] = insertelement <4 x float> [[TMP104]], float [[TMP101]], i32 2
+; CHECK-NEXT:    [[TMP106:%.*]] = insertelement <4 x float> [[TMP105]], float [[TMP102]], i32 3
 ; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x float> poison, float [[TMP107]], i32 0
 ; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <4 x float> [[TMP111]], float [[TMP108]], i32 1
 ; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2
@@ -473,10 +473,10 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[TMP10]], align 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP15]], i32 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP22]], ptr [[TMP15]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x ptr> [[TMP20]], ptr [[TMP19]], i32 1
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <2 x ptr> [[TMP17]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 3361068c99220..c09beedf76ee6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -376,14 +376,14 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x ptr> [[TMP42]], ptr [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP43]], ptr [[TMP16]], i32 3
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll
index 3a07bcca523ce..abb69ff67baf0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll
@@ -32,9 +32,10 @@ define void @test_pr55375_interleave_opaque_ptr(ptr %start, ptr %end) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x ptr> [[TMP9]], ptr [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x ptr> zeroinitializer, <2 x ptr> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x ptr> [[TMP12]], <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x ptr> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index f4cd48de60243..333454bb6d27b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -163,30 +163,30 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -320,30 +320,30 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -352,30 +352,30 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP65:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP67:%.*]] = load i32, ptr [[ALLOCA]], align 4
-; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1
-; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2
-; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3
 ; CHECK-NEXT:    [[TMP72:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP73:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP74:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP75:%.*]] = load i32, ptr [[ALLOCA]], align 4
-; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1
-; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2
-; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3
 ; CHECK-NEXT:    [[TMP80:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[ALLOCA]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0
+; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1
+; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3
+; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0
+; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2
+; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
 ; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
 ; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
@@ -495,30 +495,30 @@ define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i1> [[TMP64]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP74:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP74]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i1> [[TMP79]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP89:%.*]] = insertelement <4 x i1> [[TMP84]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP89]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -773,30 +773,30 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) {
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
-; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
-; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
@@ -933,30 +933,30 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1138,30 +1138,30 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1186,30 +1186,30 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4
-; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
 ; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
+; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
 ; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
@@ -1330,30 +1330,30 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1483,30 +1483,30 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1636,30 +1636,30 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1798,30 +1798,30 @@ define i32 @test_constant_max(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
-; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
-; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
@@ -1959,30 +1959,30 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2113,30 +2113,30 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2277,30 +2277,30 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2439,30 +2439,30 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2487,30 +2487,30 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4
-; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
 ; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
+; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
 ; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
@@ -2614,14 +2614,14 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i1, ptr [[TMP9]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = load i1, ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = load i1, ptr [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i1> poison, i1 [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i1> [[TMP20]], i1 [[TMP17]], i32 1
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> [[TMP21]], i1 [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP19]], i32 3
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i1, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP25:%.*]] = load i1, ptr [[TMP13]], align 1
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i1, ptr [[TMP14]], align 1
 ; CHECK-NEXT:    [[TMP27:%.*]] = load i1, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP17]], i32 1
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP19]], i32 3
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i1> poison, i1 [[TMP24]], i32 0
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i1> [[TMP28]], i1 [[TMP25]], i32 1
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i1> [[TMP29]], i1 [[TMP26]], i32 2
@@ -2638,14 +2638,14 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr [[TMP33]], align 4
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[TMP34]], align 4
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i32, ptr [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP36]], align 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i32, ptr [[TMP37]], align 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i32, ptr [[TMP38]], align 4
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i32, ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> poison, i32 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[TMP50]], i32 2
@@ -2761,30 +2761,30 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2809,30 +2809,30 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4
-; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
 ; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
+; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
 ; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
@@ -2954,30 +2954,30 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
 ; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -3002,30 +3002,30 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4
-; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
 ; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
+; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
+; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
 ; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
@@ -3164,30 +3164,30 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP33]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP34]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP35]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP36]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP37]], align 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP38]], align 1
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP39]], align 1
-; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
-; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
-; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
-; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
 ; CHECK-NEXT:    [[TMP64:%.*]] = load i1, ptr [[TMP40]], align 1
 ; CHECK-NEXT:    [[TMP65:%.*]] = load i1, ptr [[TMP41]], align 1
 ; CHECK-NEXT:    [[TMP66:%.*]] = load i1, ptr [[TMP42]], align 1
 ; CHECK-NEXT:    [[TMP67:%.*]] = load i1, ptr [[TMP43]], align 1
-; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i1> poison, i1 [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i1> [[TMP68]], i1 [[TMP65]], i32 1
-; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP66]], i32 2
-; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i1> [[TMP70]], i1 [[TMP67]], i32 3
 ; CHECK-NEXT:    [[TMP72:%.*]] = load i1, ptr [[TMP44]], align 1
 ; CHECK-NEXT:    [[TMP73:%.*]] = load i1, ptr [[TMP45]], align 1
 ; CHECK-NEXT:    [[TMP74:%.*]] = load i1, ptr [[TMP46]], align 1
 ; CHECK-NEXT:    [[TMP75:%.*]] = load i1, ptr [[TMP47]], align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
+; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
+; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i1> poison, i1 [[TMP64]], i32 0
+; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i1> [[TMP68]], i1 [[TMP65]], i32 1
+; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i1> [[TMP69]], i1 [[TMP66]], i32 2
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i1> [[TMP70]], i1 [[TMP67]], i32 3
 ; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i1> poison, i1 [[TMP72]], i32 0
 ; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i1> [[TMP76]], i1 [[TMP73]], i32 1
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i1> [[TMP77]], i1 [[TMP74]], i32 2
@@ -3212,30 +3212,30 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr
 ; CHECK-NEXT:    [[TMP97:%.*]] = load i32, ptr [[TMP81]], align 4
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP82]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = load i32, ptr [[TMP83]], align 4
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP84]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP85]], align 4
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP86]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP87]], align 4
-; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
-; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
-; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
-; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3
 ; CHECK-NEXT:    [[TMP112:%.*]] = load i32, ptr [[TMP88]], align 4
 ; CHECK-NEXT:    [[TMP113:%.*]] = load i32, ptr [[TMP89]], align 4
 ; CHECK-NEXT:    [[TMP114:%.*]] = load i32, ptr [[TMP90]], align 4
 ; CHECK-NEXT:    [[TMP115:%.*]] = load i32, ptr [[TMP91]], align 4
-; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <4 x i32> poison, i32 [[TMP112]], i32 0
-; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <4 x i32> [[TMP116]], i32 [[TMP113]], i32 1
-; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP114]], i32 2
-; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP115]], i32 3
 ; CHECK-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP92]], align 4
 ; CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP93]], align 4
 ; CHECK-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP94]], align 4
 ; CHECK-NEXT:    [[TMP123:%.*]] = load i32, ptr [[TMP95]], align 4
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
+; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
+; CHECK-NEXT:    [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3
+; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <4 x i32> poison, i32 [[TMP112]], i32 0
+; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <4 x i32> [[TMP116]], i32 [[TMP113]], i32 1
+; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <4 x i32> [[TMP117]], i32 [[TMP114]], i32 2
+; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP115]], i32 3
 ; CHECK-NEXT:    [[TMP124:%.*]] = insertelement <4 x i32> poison, i32 [[TMP120]], i32 0
 ; CHECK-NEXT:    [[TMP125:%.*]] = insertelement <4 x i32> [[TMP124]], i32 [[TMP121]], i32 1
 ; CHECK-NEXT:    [[TMP126:%.*]] = insertelement <4 x i32> [[TMP125]], i32 [[TMP122]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index ad8f1fb3ccd21..0a4dde0853dc3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -106,14 +106,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; CHECK-NEXT:    [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
-; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
-; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
-; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
 ; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
@@ -122,14 +114,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; CHECK-NEXT:    [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
-; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
-; CHECK-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
-; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
-; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
@@ -138,14 +122,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; CHECK-NEXT:    [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
-; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
-; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
-; CHECK-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
-; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
-; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
-; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
-; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
 ; CHECK-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
@@ -154,6 +130,30 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; CHECK-NEXT:    [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
+; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
+; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
+; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
+; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
+; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
+; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
+; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
+; CHECK-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
+; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
+; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
+; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
+; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
+; CHECK-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
+; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
+; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
+; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
+; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
 ; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
 ; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
 ; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
@@ -338,14 +338,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; MAX-BW-NEXT:    [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
-; MAX-BW-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
-; MAX-BW-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
-; MAX-BW-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
-; MAX-BW-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
-; MAX-BW-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
-; MAX-BW-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
-; MAX-BW-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
-; MAX-BW-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
 ; MAX-BW-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
@@ -354,14 +346,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; MAX-BW-NEXT:    [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
-; MAX-BW-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
-; MAX-BW-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
-; MAX-BW-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
-; MAX-BW-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
-; MAX-BW-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
-; MAX-BW-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
-; MAX-BW-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
-; MAX-BW-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
 ; MAX-BW-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
@@ -370,14 +354,6 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; MAX-BW-NEXT:    [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
-; MAX-BW-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
-; MAX-BW-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
-; MAX-BW-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
-; MAX-BW-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
-; MAX-BW-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
-; MAX-BW-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
-; MAX-BW-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
-; MAX-BW-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
 ; MAX-BW-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
@@ -386,6 +362,30 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; MAX-BW-NEXT:    [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
 ; MAX-BW-NEXT:    [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
+; MAX-BW-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
+; MAX-BW-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
+; MAX-BW-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
+; MAX-BW-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
+; MAX-BW-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
+; MAX-BW-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
+; MAX-BW-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
+; MAX-BW-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
+; MAX-BW-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
+; MAX-BW-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
+; MAX-BW-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
+; MAX-BW-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
+; MAX-BW-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
+; MAX-BW-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
+; MAX-BW-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
+; MAX-BW-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
+; MAX-BW-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
+; MAX-BW-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
+; MAX-BW-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
+; MAX-BW-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
+; MAX-BW-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
+; MAX-BW-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
+; MAX-BW-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
 ; MAX-BW-NEXT:    [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
 ; MAX-BW-NEXT:    [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
 ; MAX-BW-NEXT:    [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 7684d274a75cf..fe4f3527dfca8 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -1486,14 +1486,14 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load double, ptr [[TMP9]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load double, ptr [[TMP10]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = load double, ptr [[TMP11]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x double> poison, double [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x double> [[TMP20]], double [[TMP17]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> [[TMP21]], double [[TMP18]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> [[TMP22]], double [[TMP19]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load double, ptr [[TMP12]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load double, ptr [[TMP13]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = load double, ptr [[TMP14]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load double, ptr [[TMP15]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = insertelement <4 x double> poison, double [[TMP16]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP46:%.*]] = insertelement <4 x double> [[TMP45]], double [[TMP17]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = insertelement <4 x double> [[TMP46]], double [[TMP18]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> [[TMP47]], double [[TMP19]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = insertelement <4 x double> poison, double [[TMP24]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = insertelement <4 x double> [[TMP28]], double [[TMP25]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP26]], i32 2
@@ -1943,14 +1943,14 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = load i16, ptr [[TMP10]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[TMP11]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[TMP12]], align 2
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[TMP20]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP21]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> [[TMP25]], i16 [[TMP22]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load i16, ptr [[TMP13]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load i16, ptr [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = load i16, ptr [[TMP15]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i16, ptr [[TMP16]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP50:%.*]] = insertelement <4 x i16> [[TMP49]], i16 [[TMP20]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = insertelement <4 x i16> [[TMP50]], i16 [[TMP21]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> [[TMP51]], i16 [[TMP22]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = insertelement <4 x i16> poison, i16 [[TMP27]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP31]], i16 [[TMP28]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP29]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 5a5b06de69552..1ec42d2240485 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1002,10 +1002,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
 ; UNROLL-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
 ; UNROLL-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
-; UNROLL-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0
-; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1
 ; UNROLL-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP8]], align 4
 ; UNROLL-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4
+; UNROLL-NEXT:    [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0
+; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP11]], i64 1
 ; UNROLL-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0
 ; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP15]], i64 1
 ; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
@@ -1014,10 +1014,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
 ; UNROLL-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP18]], align 4
 ; UNROLL-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4
-; UNROLL-NEXT:    [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0
-; UNROLL-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP23]], i64 1
 ; UNROLL-NEXT:    [[TMP26:%.*]] = load float, ptr [[TMP20]], align 4
 ; UNROLL-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4
+; UNROLL-NEXT:    [[TMP45:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0
+; UNROLL-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> [[TMP45]], float [[TMP23]], i64 1
 ; UNROLL-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0
 ; UNROLL-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP27]], i64 1
 ; UNROLL-NEXT:    [[TMP30:%.*]] = fadd fast <2 x float> [[VEC_PHI]], splat (float 1.000000e+00)
@@ -1083,10 +1083,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP12]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP12]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]]
@@ -1095,10 +1095,10 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP20]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP22]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP46:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <2 x float> [[TMP46]], float [[TMP24]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = fadd fast <2 x float> [[VEC_PHI]], splat (float 1.000000e+00)
@@ -1345,10 +1345,10 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
 ; UNROLL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 8
 ; UNROLL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
-; UNROLL-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
-; UNROLL-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i64 1
 ; UNROLL-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 8
 ; UNROLL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8
+; UNROLL-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
+; UNROLL-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP8]], i64 1
 ; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
 ; UNROLL-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
 ; UNROLL-NEXT:    [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]]
@@ -1405,10 +1405,10 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP9]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP9]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
index f948906d4d0b8..59debc5f7904b 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
@@ -79,8 +79,9 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly
 ; CHECK-NEXT:      vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
 ; CHECK-NEXT:      WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]>
 ; CHECK-NEXT:      REPLICATE ir<%call> = call @foo(ir<%in_val>)
-; CHECK-NEXT:      WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
-; CHECK-NEXT:      WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-NEXT:      EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
+; CHECK-NEXT:      WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
+; CHECK-NEXT:      WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ; CHECK-NEXT:      CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]>
 ; CHECK-NEXT:      vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2>
 ; CHECK-NEXT:      WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a>

>From af03b31f1e5c21c3382d3264e984cb10a5f749b5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 21 Jun 2025 15:36:26 +0100
Subject: [PATCH 2/4] [VPlan] Explicitly Unpack.

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |   1 +
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   4 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  29 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  21 +
 .../epilog-vectorization-widen-inductions.ll  |   8 +-
 .../AArch64/struct-return-cost.ll             |  33 +-
 .../AArch64/synthesize-mask-for-call.ll       |   3 +-
 ...nterleave-to-widen-memory-with-wide-ops.ll |  16 +-
 .../AArch64/type-shrinkage-insertelt.ll       |  96 +-
 .../widen-call-with-intrinsic-or-libfunc.ll   |  10 +-
 .../AArch64/wider-VF-for-callinst.ll          |   2 +-
 .../LoopVectorize/ARM/mve-saddsatcost.ll      |   2 +-
 .../RISCV/riscv-vector-reverse-output.ll      |  46 +-
 .../X86/fixed-order-recurrence.ll             |   8 +-
 .../LoopVectorize/X86/gather_scatter.ll       |  18 +-
 .../LoopVectorize/X86/induction-costs.ll      |  30 +-
 ...terleave-ptradd-with-replicated-operand.ll |  34 +-
 ...leaved-accesses-hoist-load-across-store.ll |  72 +-
 .../LoopVectorize/X86/parallel-loops.ll       |  16 +-
 ...licate-recipe-with-only-first-lane-used.ll |  14 +-
 .../LoopVectorize/X86/strided_load_cost.ll    |  78 +-
 .../X86/vplan-native-inner-loop-only.ll       |   6 +-
 .../x86-interleaved-accesses-masked-group.ll  |  14 +-
 ...86-interleaved-store-accesses-with-gaps.ll |  28 +-
 llvm/test/Transforms/LoopVectorize/assume.ll  |   6 +-
 .../Transforms/LoopVectorize/bsd_regex.ll     |   6 +-
 ...able-info-from-assumption-constant-size.ll |  60 +-
 .../LoopVectorize/forked-pointers.ll          |   6 +-
 .../Transforms/LoopVectorize/induction.ll     | 134 +--
 .../interleaved-accesses-pred-stores.ll       |   2 +-
 .../LoopVectorize/interleaved-accesses.ll     |  64 +-
 .../LoopVectorize/iv_outside_user.ll          |   2 +-
 .../LoopVectorize/load-deref-pred-align.ll    |   4 +-
 .../Transforms/LoopVectorize/loop-scalars.ll  |   2 +-
 .../optimal-epilog-vectorization.ll           |  16 +-
 llvm/test/Transforms/LoopVectorize/optsize.ll |  10 +-
 ...ction-index-width-smaller-than-iv-width.ll |   8 +-
 .../pr39417-optsize-scevchecks.ll             |   6 +-
 .../reduction-with-invariant-store.ll         |   6 +-
 .../LoopVectorize/scev-predicate-reasoning.ll |  18 +-
 .../LoopVectorize/single-value-blend-phis.ll  |   2 +-
 .../uniform-args-call-variants.ll             |   8 +-
 .../Transforms/LoopVectorize/uniform-blend.ll |   6 +-
 .../uniform_across_vf_induction1.ll           | 240 ++---
 .../uniform_across_vf_induction1_and.ll       | 136 +--
 .../uniform_across_vf_induction1_div_urem.ll  | 132 +--
 .../uniform_across_vf_induction1_lshr.ll      | 440 ++++-----
 .../uniform_across_vf_induction2.ll           | 928 +++++++++---------
 .../version-stride-with-integer-casts.ll      |   6 +-
 ...ive-path-inner-loop-with-runtime-checks.ll |  20 +-
 .../LoopVectorize/vplan-printing.ll           |   4 +-
 .../vplan-widen-struct-return.ll              |   3 +-
 52 files changed, 1454 insertions(+), 1410 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1570fbc4eac73..e0201ff334ce1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -971,6 +971,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     StepVector,
 
     Pack,
+    Unpack,
 
   };
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 9004a09be5726..c1c2ca7d8e639 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -101,6 +101,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return Type::getIntNTy(Ctx, 32);
   case Instruction::PHI:
   case VPInstruction::Pack:
+  case VPInstruction::Unpack:
     // Infer the type of first operand only, as other operands of header phi's
     // may lead to infinite recursion.
     return inferScalarType(R->getOperand(0));
@@ -442,7 +443,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       break;
     for (VPRecipeBase &R : *VPBB) {
       if (isa<VPInstruction>(&R) &&
-          cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Pack)
+          (cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Pack ||
+           cast<VPInstruction>(&R)->getOpcode() == VPInstruction::Unpack))
         continue;
       Idx2Recipe.push_back(&R);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c9c78e00de3f8..1e8cf8c912901 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -418,7 +418,8 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
 }
 
 bool VPInstruction::doesGeneratePerAllLanes() const {
-  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
+  return (Opcode == VPInstruction::PtrAdd || Opcode == VPInstruction::Unpack) &&
+         !vputils::onlyFirstLaneUsed(this);
 }
 
 bool VPInstruction::canGenerateScalarForFirstLane() const {
@@ -438,6 +439,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
   case VPInstruction::AnyOf:
+  case VPInstruction::Unpack:
     return true;
   default:
     return false;
@@ -448,10 +450,17 @@ Value *VPInstruction::generatePerLane(VPTransformState &State,
                                       const VPLane &Lane) {
   IRBuilderBase &Builder = State.Builder;
 
-  assert(getOpcode() == VPInstruction::PtrAdd &&
-         "only PtrAdd opcodes are supported for now");
-  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
-                              State.get(getOperand(1), Lane), Name);
+  switch (getOpcode()) {
+  case VPInstruction::PtrAdd:
+    return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
+                                State.get(getOperand(1), Lane), Name);
+
+  case VPInstruction::Unpack: {
+    Value *LaneV = Lane.getAsRuntimeExpr(State.Builder, State.VF);
+    return Builder.CreateExtractElement(State.get(getOperand(0)), LaneV);
+  }
+  }
+  llvm_unreachable("all supported opcodes must be handled above");
 }
 
 /// Create a conditional branch using \p Cond branching to the successors of \p
@@ -775,6 +784,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
           State.packScalarIntoVectorizedValue(getOperand(0), WideValue, Lane);
     return WideValue;
   }
+  case VPInstruction::Unpack: {
+    assert(vputils::onlyFirstLaneUsed(this) &&
+           "can only generate first lane for PtrAdd");
+    return generatePerLane(State, VPLane(0));
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -934,6 +948,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::StepVector:
   case VPInstruction::ReductionStartVector:
   case VPInstruction::Pack:
+  case VPInstruction::Unpack:
     return false;
   default:
     return true;
@@ -1077,6 +1092,10 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Pack:
     O << "pack-into-vector";
     break;
+  case VPInstruction::Unpack:
+    O << "unpack-into-scalars";
+    break;
+
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4b28be528d3ae..e48ac81f8857c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1905,6 +1905,27 @@ static void materializePack(VPlan &Plan) {
             cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
         continue;
       auto *Def = cast<VPSingleDefRecipe>(&R);
+      for (auto *Op : to_vector(Def->operands())) {
+        VPRecipeBase *OpDef = Op->getDefiningRecipe();
+        if (!OpDef || isa<VPReplicateRecipe>(OpDef) ||
+            vputils::isSingleScalar(Op) ||
+            (isa<VPInstruction>(OpDef) &&
+             cast<VPInstruction>(OpDef)->doesGeneratePerAllLanes()) ||
+            isa<VPScalarIVStepsRecipe>(OpDef))
+          continue;
+
+        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Op});
+        if (OpDef->isPhi())
+          Unpack->insertBefore(*OpDef->getParent(),
+                               OpDef->getParent()->getFirstNonPhi());
+        else
+          Unpack->insertAfter(OpDef);
+        Op->replaceUsesWithIf(Unpack, [](VPUser &U, unsigned) {
+          auto *RepR = dyn_cast<VPReplicateRecipe>(&U);
+          return RepR && (!isa<StoreInst>(RepR->getUnderlyingInstr()) ||
+                          !vputils::isSingleScalar(RepR->getOperand(1)));
+        });
+      }
       if (all_of(Def->users(),
                  [Def](VPUser *U) { return U->usesScalars(Def); }))
         continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 0dd806f1350e2..79a2cdce4086b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -29,12 +29,12 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP10]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
@@ -62,8 +62,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP22]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP23]])
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i32 0
 ; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
index 49b4080da3434..ea8fa88573ac4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
@@ -15,12 +15,14 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
@@ -66,17 +68,20 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
@@ -123,32 +128,38 @@ exit:
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF 8: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 1: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 2: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
 ;
-; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-COST: Cost of 0 for VF vscale x 4: EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 7d7fb8088c6fe..8527f46f740e4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -28,7 +28,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN ir<%load> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:     REPLICATE ir<%call> = call @foo(ir<%load>)
+; CHECK-NEXT:     EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%load>
+; CHECK-NEXT:     REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-NEXT:     EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 813d61b52100f..35441a38a6fee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -768,15 +768,15 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[TMP22:%.*]] = shufflevector <6 x i32> [[WIDE_VEC1]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
 ; VF2-NEXT:    [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]]
 ; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
-; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; VF2-NEXT:    store i32 [[TMP15]], ptr [[TMP8]], align 8
 ; VF2-NEXT:    store i32 [[TMP16]], ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
 ; VF2-NEXT:    [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]]
 ; VF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
-; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; VF2-NEXT:    store i32 [[TMP24]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    store i32 [[TMP25]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98
@@ -809,12 +809,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF4-NEXT:    [[TMP44:%.*]] = shufflevector <12 x i32> [[WIDE_VEC1]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; VF4-NEXT:    [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]]
 ; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0
-; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
 ; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1
-; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
 ; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2
-; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3
+; VF4-NEXT:    store i32 [[TMP29]], ptr [[TMP16]], align 8
+; VF4-NEXT:    store i32 [[TMP30]], ptr [[TMP17]], align 8
+; VF4-NEXT:    store i32 [[TMP31]], ptr [[TMP18]], align 8
 ; VF4-NEXT:    store i32 [[TMP32]], ptr [[TMP19]], align 8
 ; VF4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1
 ; VF4-NEXT:    [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1
@@ -822,12 +822,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) {
 ; VF4-NEXT:    [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1
 ; VF4-NEXT:    [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]]
 ; VF4-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
 ; VF4-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
 ; VF4-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; VF4-NEXT:    store i32 [[TMP46]], ptr [[TMP33]], align 8
+; VF4-NEXT:    store i32 [[TMP47]], ptr [[TMP34]], align 8
+; VF4-NEXT:    store i32 [[TMP48]], ptr [[TMP35]], align 8
 ; VF4-NEXT:    store i32 [[TMP49]], ptr [[TMP36]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
index c017c1f3b36ee..ae1f39982d565 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -21,30 +21,30 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32
-; CHECK-NEXT:    [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
-; CHECK-NEXT:    [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
-; CHECK-NEXT:    store i16 [[TMP23]], ptr [[TMP19]], align 2
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
-; CHECK-NEXT:    store i16 [[TMP24]], ptr [[TMP20]], align 2
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
-; CHECK-NEXT:    store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
-; CHECK-NEXT:    store i16 [[TMP26]], ptr [[TMP22]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]]
+; CHECK-NEXT:    store i16 [[TMP7]], ptr [[TMP23]], align 2
+; CHECK-NEXT:    store i16 [[TMP8]], ptr [[TMP24]], align 2
+; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP25]], align 2
+; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP26]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -115,30 +115,30 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
-; CHECK-NEXT:    [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
-; CHECK-NEXT:    [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
-; CHECK-NEXT:    [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0
-; CHECK-NEXT:    store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1
-; CHECK-NEXT:    store i16 [[TMP26]], ptr [[TMP22]], align 2
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
-; CHECK-NEXT:    store i16 [[TMP27]], ptr [[TMP23]], align 2
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
-; CHECK-NEXT:    store i16 [[TMP28]], ptr [[TMP24]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32
+; CHECK-NEXT:    [[TMP23:%.*]] = ashr exact i64 [[TMP19]], 32
+; CHECK-NEXT:    [[TMP24:%.*]] = ashr exact i64 [[TMP20]], 32
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP24]]
+; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP25]], align 2
+; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP26]], align 2
+; CHECK-NEXT:    store i16 [[TMP11]], ptr [[TMP27]], align 2
+; CHECK-NEXT:    store i16 [[TMP12]], ptr [[TMP28]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 1f2e91884a5d9..96fc4144f4a89 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -28,8 +28,9 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:     WIDEN-CAST ir<%conv> = fpext ir<%l> to double
 ; CHECK-NEXT:     WIDEN-CALL ir<%s> = call fast @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64)
+; CHECK-NEXT:     EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%s>
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
-; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
+; CHECK-NEXT:     REPLICATE store vp<[[SCALARS]]>, ir<%gep.dst>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT:   No successors
@@ -75,8 +76,9 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:     WIDEN-CAST ir<%conv> = fpext ir<%l> to double
 ; CHECK-NEXT:     WIDEN-INTRINSIC ir<%s> = call fast llvm.sin(ir<%conv>)
+; CHECK-NEXT:     EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%s>
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
-; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
+; CHECK-NEXT:     REPLICATE store vp<[[SCALARS]]>, ir<%gep.dst>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT:   No successors
@@ -113,11 +115,11 @@ define void @test(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast <2 x double> @__simd_sin_v2f64(<2 x double> [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    store double [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
 ; CHECK-NEXT:    store double [[TMP9]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 3545c6b2239d7..7015b3c0fbc53 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -57,8 +57,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
 ; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
 ; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
 ; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
 ; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
 ; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
index 04a97f451770a..a0c48e312d727 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
@@ -8,7 +8,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: arm_offset_q15
 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(vp<{{.+}}>, ir<%offset>)
 ; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 ; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index 758101d3e886e..291d80bfe94ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -460,17 +460,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
 ; RV64-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
 ; RV64-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
 ; RV64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
 ; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
 ; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
 ; RV64-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
 ; RV64-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
 ; RV64-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV64-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; RV64-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
@@ -523,17 +523,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV32-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
 ; RV32-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
 ; RV32-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
 ; RV32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
 ; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
 ; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
-; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
 ; RV32-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
-; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
 ; RV32-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
-; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
 ; RV32-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
-; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
 ; RV32-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; RV32-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
@@ -607,29 +607,29 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-NEXT:    [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
 ; RV64-UF2-NEXT:    [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
 ; RV64-UF2-NEXT:    [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
-; RV64-UF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
-; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
-; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
-; RV64-UF2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
-; RV64-UF2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
-; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
-; RV64-UF2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
-; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
 ; RV64-UF2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
-; RV64-UF2-NEXT:    store i7 [[TMP7]], ptr [[TMP9]], align 1
 ; RV64-UF2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
-; RV64-UF2-NEXT:    store i7 [[TMP8]], ptr [[TMP10]], align 1
 ; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
-; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP44]], align 1
 ; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
-; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP45]], align 1
 ; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
-; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP46]], align 1
 ; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
-; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP47]], align 1
 ; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
-; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP48]], align 1
 ; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
+; RV64-UF2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
+; RV64-UF2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
+; RV64-UF2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
+; RV64-UF2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
+; RV64-UF2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
+; RV64-UF2-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    store i7 [[TMP7]], ptr [[TMP63]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP8]], ptr [[TMP64]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP65]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP66]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP67]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP68]], align 1
+; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP69]], align 1
 ; RV64-UF2-NEXT:    store i7 [[TMP57]], ptr [[TMP49]], align 1
 ; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; RV64-UF2-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index c09beedf76ee6..3a0c15ebd05e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -389,17 +389,17 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
 ; CHECK-NEXT:    store float [[TMP30]], ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
 ; CHECK-NEXT:    [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
 ; CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
 ; CHECK-NEXT:    [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
 ; CHECK-NEXT:    store float [[TMP36]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index c4fc60908c7e0..57fda151f4739 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -687,16 +687,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX21]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META8]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD16]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META15]]
 ; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
 ; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
 ; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
 ; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
-; AVX512-NEXT:    br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; AVX512-NEXT:    br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
 ; AVX512-NEXT:    [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
 ; AVX512-NEXT:    br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -716,7 +716,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
 ; AVX512-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
 ; AVX512-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
-; AVX512-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; AVX512-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; AVX512:       for.end:
 ; AVX512-NEXT:    ret void
 ;
@@ -778,16 +778,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; FVW2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]]
 ; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
-; FVW2-NEXT:    store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
 ; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; FVW2-NEXT:    store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
 ; FVW2-NEXT:    store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]]
 ; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
 ; FVW2-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]]
+; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
+; FVW2-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1
 ; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1
 ; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
 ; FVW2-NEXT:    store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; FVW2-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1
 ; FVW2-NEXT:    store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]]
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; FVW2-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 0a85548f8750b..2b08eb27d89b1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -303,36 +303,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0
-; CHECK-NEXT:    store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1
-; CHECK-NEXT:    store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2
-; CHECK-NEXT:    store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3
-; CHECK-NEXT:    store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4
-; CHECK-NEXT:    store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5
-; CHECK-NEXT:    store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6
-; CHECK-NEXT:    store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7
-; CHECK-NEXT:    store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8
-; CHECK-NEXT:    store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9
-; CHECK-NEXT:    store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i32 10
-; CHECK-NEXT:    store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11
-; CHECK-NEXT:    store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12
-; CHECK-NEXT:    store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13
-; CHECK-NEXT:    store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14
-; CHECK-NEXT:    store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i32 15
+; CHECK-NEXT:    store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]]
+; CHECK-NEXT:    store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META17]], !noalias [[META14]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
index cdc7839bfc0f0..4b6fd1338c32c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
@@ -85,36 +85,36 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
 ; CHECK-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0
-; CHECK-NEXT:    store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1
-; CHECK-NEXT:    store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2
-; CHECK-NEXT:    store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3
-; CHECK-NEXT:    store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0
-; CHECK-NEXT:    store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1
-; CHECK-NEXT:    store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2
-; CHECK-NEXT:    store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3
-; CHECK-NEXT:    store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0
-; CHECK-NEXT:    store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1
-; CHECK-NEXT:    store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2
-; CHECK-NEXT:    store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4
 ; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3
-; CHECK-NEXT:    store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4
 ; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0
-; CHECK-NEXT:    store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1
-; CHECK-NEXT:    store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4
 ; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2
-; CHECK-NEXT:    store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4
 ; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3
+; CHECK-NEXT:    store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT:    store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4
+; CHECK-NEXT:    store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4
+; CHECK-NEXT:    store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4
+; CHECK-NEXT:    store i32 [[TMP44]], ptr [[NEXT_GEP5]], align 4
+; CHECK-NEXT:    store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4
+; CHECK-NEXT:    store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP48]], ptr [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4
+; CHECK-NEXT:    store i32 [[TMP51]], ptr [[NEXT_GEP12]], align 4
+; CHECK-NEXT:    store i32 [[TMP52]], ptr [[NEXT_GEP13]], align 4
+; CHECK-NEXT:    store i32 [[TMP53]], ptr [[NEXT_GEP14]], align 4
+; CHECK-NEXT:    store i32 [[TMP54]], ptr [[NEXT_GEP15]], align 4
 ; CHECK-NEXT:    store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
@@ -123,11 +123,11 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[M]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL26]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
 ; CHECK-NEXT:    [[P_4:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 4
 ; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[P_4]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
index 09946bfda5a7a..be1781da234f8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
@@ -28,18 +28,18 @@ define void @pr63602_1(ptr %arr) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
-; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
-; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
-; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
-; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[TMP8]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    store i32 [[TMP9]], ptr [[TMP13]], align 4
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <12 x i32>, ptr [[TMP17]], align 4
@@ -47,13 +47,13 @@ define void @pr63602_1(ptr %arr) {
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC2]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0
-; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1
-; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2
-; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
-; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP13]], align 4
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -143,18 +143,18 @@ define void @pr63602_2(ptr %arr) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
-; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
-; CHECK-NEXT:    store i32 [[TMP18]], ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
-; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]]
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP17]], align 4
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP18]], align 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP3]], 2
@@ -163,10 +163,10 @@ define void @pr63602_2(ptr %arr) {
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i32 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 1
 ; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 2
@@ -181,13 +181,13 @@ define void @pr63602_2(ptr %arr) {
 ; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = add <4 x i32> [[TMP35]], [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0
-; CHECK-NEXT:    store i32 [[TMP45]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1
-; CHECK-NEXT:    store i32 [[TMP46]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2
-; CHECK-NEXT:    store i32 [[TMP47]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3
-; CHECK-NEXT:    store i32 [[TMP48]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store i32 [[TMP45]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    store i32 [[TMP46]], ptr [[TMP17]], align 4
+; CHECK-NEXT:    store i32 [[TMP47]], ptr [[TMP18]], align 4
+; CHECK-NEXT:    store i32 [[TMP48]], ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index f1cbb4acc6aa6..879066cee931a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -73,6 +73,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
@@ -92,14 +96,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
-; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
-; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
-; CHECK-NEXT:    store i32 [[TMP24]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]]
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]]
+; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP27]], i64 4
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !llvm.access.group [[ACC_GRP0]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index e5883351ae30d..51944b9329275 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -145,20 +145,20 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP16]]
 ; CHECK-NEXT:    [[PREDPHI15:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP32]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 2
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 1
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 2
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i64> [[PREDPHI15]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
 ; CHECK-NEXT:    store i16 0, ptr [[TMP36]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index 0a4dde0853dc3..a36ad9b854d06 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -530,6 +530,14 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8>
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]]
@@ -538,21 +546,13 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i8 [[TMP28]], ptr [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1
 ; CHECK-NEXT:    store i8 [[TMP29]], ptr [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2
 ; CHECK-NEXT:    store i8 [[TMP30]], ptr [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3
 ; CHECK-NEXT:    store i8 [[TMP31]], ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4
 ; CHECK-NEXT:    store i8 [[TMP32]], ptr [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5
 ; CHECK-NEXT:    store i8 [[TMP33]], ptr [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6
 ; CHECK-NEXT:    store i8 [[TMP34]], ptr [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7
 ; CHECK-NEXT:    store i8 [[TMP35]], ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -626,53 +626,53 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; MAX-BW-NEXT:    [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
 ; MAX-BW-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
-; MAX-BW-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]]
-; MAX-BW-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
-; MAX-BW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
-; MAX-BW-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]]
-; MAX-BW-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]]
-; MAX-BW-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]]
-; MAX-BW-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]]
-; MAX-BW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]]
-; MAX-BW-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]]
-; MAX-BW-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]]
-; MAX-BW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]]
-; MAX-BW-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]]
-; MAX-BW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]]
-; MAX-BW-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]]
-; MAX-BW-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]]
-; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]]
 ; MAX-BW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0
-; MAX-BW-NEXT:    store i8 [[TMP52]], ptr [[TMP36]], align 1
 ; MAX-BW-NEXT:    [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1
-; MAX-BW-NEXT:    store i8 [[TMP53]], ptr [[TMP37]], align 1
 ; MAX-BW-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2
-; MAX-BW-NEXT:    store i8 [[TMP54]], ptr [[TMP38]], align 1
 ; MAX-BW-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3
-; MAX-BW-NEXT:    store i8 [[TMP55]], ptr [[TMP39]], align 1
 ; MAX-BW-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4
-; MAX-BW-NEXT:    store i8 [[TMP56]], ptr [[TMP40]], align 1
 ; MAX-BW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5
-; MAX-BW-NEXT:    store i8 [[TMP57]], ptr [[TMP41]], align 1
 ; MAX-BW-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6
-; MAX-BW-NEXT:    store i8 [[TMP58]], ptr [[TMP42]], align 1
 ; MAX-BW-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7
-; MAX-BW-NEXT:    store i8 [[TMP59]], ptr [[TMP43]], align 1
 ; MAX-BW-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8
-; MAX-BW-NEXT:    store i8 [[TMP60]], ptr [[TMP44]], align 1
 ; MAX-BW-NEXT:    [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9
-; MAX-BW-NEXT:    store i8 [[TMP61]], ptr [[TMP45]], align 1
 ; MAX-BW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10
-; MAX-BW-NEXT:    store i8 [[TMP62]], ptr [[TMP46]], align 1
 ; MAX-BW-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11
-; MAX-BW-NEXT:    store i8 [[TMP63]], ptr [[TMP47]], align 1
 ; MAX-BW-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12
-; MAX-BW-NEXT:    store i8 [[TMP64]], ptr [[TMP48]], align 1
 ; MAX-BW-NEXT:    [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13
-; MAX-BW-NEXT:    store i8 [[TMP65]], ptr [[TMP49]], align 1
 ; MAX-BW-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14
-; MAX-BW-NEXT:    store i8 [[TMP66]], ptr [[TMP50]], align 1
 ; MAX-BW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15
+; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]]
+; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; MAX-BW-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]]
+; MAX-BW-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]]
+; MAX-BW-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]]
+; MAX-BW-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]]
+; MAX-BW-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]]
+; MAX-BW-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]]
+; MAX-BW-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]]
+; MAX-BW-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]]
+; MAX-BW-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]]
+; MAX-BW-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]]
+; MAX-BW-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]]
+; MAX-BW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]]
+; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]]
+; MAX-BW-NEXT:    store i8 [[TMP52]], ptr [[TMP69]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP53]], ptr [[TMP70]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP54]], ptr [[TMP71]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP55]], ptr [[TMP72]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP56]], ptr [[TMP73]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP57]], ptr [[TMP74]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP58]], ptr [[TMP75]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP59]], ptr [[TMP76]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP60]], ptr [[TMP77]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP61]], ptr [[TMP78]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP62]], ptr [[TMP79]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP63]], ptr [[TMP80]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP64]], ptr [[TMP81]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP65]], ptr [[TMP82]], align 1
+; MAX-BW-NEXT:    store i8 [[TMP66]], ptr [[TMP83]], align 1
 ; MAX-BW-NEXT:    store i8 [[TMP67]], ptr [[TMP51]], align 1
 ; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; MAX-BW-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll
index 85d6c801dee69..f11e194018f4f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll
@@ -32,12 +32,12 @@ define void @test(ptr %A) {
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[STRIDED_VEC]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
-; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 1
-; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index f29428c51c636..0a4f326608dd7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -794,20 +794,20 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1)
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP15]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP2]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = load i8, ptr [[TMP4]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
index 414394a8942e5..d8cbcec318f22 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -27,41 +27,41 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2)
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP8]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP10]], ptr [[TMP3]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP11]], ptr [[TMP5]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP12]], ptr [[TMP7]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP13]], ptr [[TMP9]], align 2
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP14]], align 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP15:%.*]] = or disjoint <4 x i64> [[TMP1]], splat (i64 1)
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP15]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP15]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP15]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP15]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP22]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP24]], ptr [[TMP17]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP25]], ptr [[TMP19]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP26]], ptr [[TMP21]], align 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3
 ; DISABLED_MASKED_STRIDED-NEXT:    store i16 [[TMP27]], ptr [[TMP23]], align 2
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index c81f48ff62afc..23a26b28acec9 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -8,12 +8,12 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+02)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+02)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP3]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP5]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP3]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP5]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index 8397b1ed88dcd..5e96abce8c5e2 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -21,12 +21,12 @@ define i32 @foo(ptr nocapture %A) {
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    store i32 4, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    store i32 4, ptr [[TMP5]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index 7a54519c7cdf8..35780f207f162 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -15,8 +15,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
@@ -26,8 +26,7 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -35,8 +34,7 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
@@ -191,8 +189,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 2) ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
@@ -202,8 +200,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -211,8 +208,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
@@ -291,8 +287,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
@@ -302,8 +298,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -311,8 +306,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
@@ -391,8 +385,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
@@ -402,8 +396,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -411,8 +404,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
@@ -491,8 +483,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
@@ -502,8 +494,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -511,8 +502,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP15]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
@@ -686,6 +676,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
@@ -694,7 +686,6 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
@@ -703,19 +694,14 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP23]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP28]], i64 4), "dereferenceable"(ptr [[TMP20]], i64 4) ]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP29]], i64 4), "dereferenceable"(ptr [[TMP19]], i64 4) ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP17]], i64 4), "dereferenceable"(ptr [[TMP17]], i64 4) ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP22]], i64 4), "dereferenceable"(ptr [[TMP22]], i64 4) ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
@@ -792,8 +778,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ]
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
@@ -803,8 +789,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
 ; CHECK:       [[PRED_LOAD_CONTINUE]]:
@@ -812,8 +797,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_IF1]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP27]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
 ; CHECK:       [[PRED_LOAD_CONTINUE2]]:
diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
index 6374fba71f2c2..6a5e92d1c96f2 100644
--- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
@@ -47,14 +47,14 @@ define dso_local void @forked_ptrs_different_base_same_offset(ptr nocapture read
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT9]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP29]], i64 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP31]], i64 12
 ; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP10]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 1ec42d2240485..d83a2c89ec8b8 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1255,8 +1255,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
-; CHECK-NEXT:    store i32 [[TMP9]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    store i32 [[TMP9]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1300,8 +1300,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; IND-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
 ; IND-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0
-; IND-NEXT:    store i32 [[TMP8]], ptr [[TMP1]], align 8
 ; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1
+; IND-NEXT:    store i32 [[TMP8]], ptr [[TMP1]], align 8
 ; IND-NEXT:    store i32 [[TMP9]], ptr [[TMP2]], align 8
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1354,12 +1354,12 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; UNROLL-NEXT:    [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0
-; UNROLL-NEXT:    store i32 [[TMP17]], ptr [[TMP3]], align 8
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1
-; UNROLL-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0
-; UNROLL-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
 ; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1
+; UNROLL-NEXT:    store i32 [[TMP17]], ptr [[TMP3]], align 8
+; UNROLL-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
+; UNROLL-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
 ; UNROLL-NEXT:    store i32 [[TMP20]], ptr [[TMP6]], align 8
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1414,12 +1414,12 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP20]], ptr [[TMP6]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP20]], ptr [[TMP6]], align 8
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP21]], ptr [[TMP7]], align 8
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1478,20 +1478,20 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]]
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP19]], ptr [[TMP9]], align 8
 ; INTERLEAVE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1
-; INTERLEAVE-NEXT:    store i32 [[TMP20]], ptr [[TMP10]], align 8
 ; INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2
-; INTERLEAVE-NEXT:    store i32 [[TMP21]], ptr [[TMP11]], align 8
 ; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3
-; INTERLEAVE-NEXT:    store i32 [[TMP22]], ptr [[TMP12]], align 8
 ; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP23]], ptr [[TMP13]], align 8
 ; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1
-; INTERLEAVE-NEXT:    store i32 [[TMP24]], ptr [[TMP14]], align 8
 ; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2
-; INTERLEAVE-NEXT:    store i32 [[TMP25]], ptr [[TMP15]], align 8
 ; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3
+; INTERLEAVE-NEXT:    store i32 [[TMP19]], ptr [[TMP9]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP20]], ptr [[TMP10]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP21]], ptr [[TMP11]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP22]], ptr [[TMP12]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP23]], ptr [[TMP13]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP24]], ptr [[TMP14]], align 8
+; INTERLEAVE-NEXT:    store i32 [[TMP25]], ptr [[TMP15]], align 8
 ; INTERLEAVE-NEXT:    store i32 [[TMP26]], ptr [[TMP16]], align 8
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1570,10 +1570,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope [[META17:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP26]], align 1, !alias.scope [[META17:![0-9]+]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
@@ -1633,10 +1633,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; IND-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
 ; IND-NEXT:    [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; IND-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0
-; IND-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; IND-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1
+; IND-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
-; IND-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope [[META17:![0-9]+]]
+; IND-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17:![0-9]+]]
 ; IND-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope [[META17]]
 ; IND-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
 ; IND-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
@@ -1700,16 +1700,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2)
 ; UNROLL-NEXT:    [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8)
 ; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0
-; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1
-; UNROLL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0
-; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
 ; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1
+; UNROLL-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
+; UNROLL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
 ; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17:![0-9]+]]
-; UNROLL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope [[META17]]
-; UNROLL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]]
+; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP35]], align 1, !alias.scope [[META17:![0-9]+]]
+; UNROLL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]]
+; UNROLL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP36]], align 1, !alias.scope [[META17]]
 ; UNROLL-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]]
 ; UNROLL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
 ; UNROLL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
@@ -1779,16 +1779,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope [[META17:![0-9]+]]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17]]
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP37]], align 1, !alias.scope [[META17:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]]
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP38]], align 1, !alias.scope [[META17]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
@@ -1864,30 +1864,30 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[DOTIDX5]]
 ; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP18]], align 1
 ; INTERLEAVE-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP19]], align 1
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP28]], ptr [[TMP20]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META20:![0-9]+]]
 ; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4
-; INTERLEAVE-NEXT:    store i32 [[TMP29]], ptr [[TMP21]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8
-; INTERLEAVE-NEXT:    store i32 [[TMP30]], ptr [[TMP22]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12
-; INTERLEAVE-NEXT:    store i32 [[TMP31]], ptr [[TMP23]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP32]], ptr [[TMP24]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4
-; INTERLEAVE-NEXT:    store i32 [[TMP33]], ptr [[TMP25]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8
-; INTERLEAVE-NEXT:    store i32 [[TMP34]], ptr [[TMP26]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12
-; INTERLEAVE-NEXT:    store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
+; INTERLEAVE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
+; INTERLEAVE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1
+; INTERLEAVE-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1
+; INTERLEAVE-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1
+; INTERLEAVE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1
+; INTERLEAVE-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1
+; INTERLEAVE-NEXT:    store i32 [[TMP28]], ptr [[TMP41]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META20:![0-9]+]]
+; INTERLEAVE-NEXT:    store i32 [[TMP29]], ptr [[TMP42]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP30]], ptr [[TMP43]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP31]], ptr [[TMP44]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP32]], ptr [[TMP45]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP33]], ptr [[TMP46]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP34]], ptr [[TMP47]], align 1, !alias.scope [[META17]], !noalias [[META20]]
+; INTERLEAVE-NEXT:    store i32 [[TMP35]], ptr [[TMP48]], align 1, !alias.scope [[META17]], !noalias [[META20]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -2442,11 +2442,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
 ; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
 ; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP8]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
@@ -2490,11 +2490,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; IND-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
 ; IND-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; IND-NEXT:    [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16>
+; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0
+; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
 ; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
 ; IND-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
-; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0
 ; IND-NEXT:    store i16 [[TMP8]], ptr [[TMP6]], align 2
-; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
 ; IND-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
@@ -2543,17 +2543,17 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; UNROLL-NEXT:    [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16>
 ; UNROLL-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
+; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0
+; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1
+; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
+; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
 ; UNROLL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
 ; UNROLL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
-; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0
 ; UNROLL-NEXT:    store i16 [[TMP14]], ptr [[TMP10]], align 2
-; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1
 ; UNROLL-NEXT:    store i16 [[TMP15]], ptr [[TMP11]], align 2
-; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
 ; UNROLL-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
-; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
@@ -2604,17 +2604,17 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP15]], ptr [[TMP11]], align 2
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP18]], ptr [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -2667,6 +2667,14 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16>
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
+; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
+; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
+; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
@@ -2675,21 +2683,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
 ; INTERLEAVE-NEXT:    store i16 [[TMP22]], ptr [[TMP14]], align 2
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
 ; INTERLEAVE-NEXT:    store i16 [[TMP23]], ptr [[TMP15]], align 2
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
 ; INTERLEAVE-NEXT:    store i16 [[TMP24]], ptr [[TMP16]], align 2
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
 ; INTERLEAVE-NEXT:    store i16 [[TMP25]], ptr [[TMP17]], align 2
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
 ; INTERLEAVE-NEXT:    store i16 [[TMP26]], ptr [[TMP18]], align 2
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
 ; INTERLEAVE-NEXT:    store i16 [[TMP27]], ptr [[TMP19]], align 2
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
 ; INTERLEAVE-NEXT:    store i16 [[TMP28]], ptr [[TMP20]], align 2
-; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
 ; INTERLEAVE-NEXT:    store i16 [[TMP29]], ptr [[TMP21]], align 2
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8)
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index fb4545cb715bb..0ff24c34ff3ca 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -144,8 +144,8 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) {
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0
-; CHECK-NEXT:    store i64 [[TMP11]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2
+; CHECK-NEXT:    store i64 [[TMP11]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 80ccc383b67f8..b37f8caa5b457 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -578,6 +578,10 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]], i32 0
@@ -595,21 +599,17 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
 ; CHECK-NEXT:    store i64 [[TMP21]], ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1
 ; CHECK-NEXT:    store i64 [[TMP22]], ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2
 ; CHECK-NEXT:    store i64 [[TMP23]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3
 ; CHECK-NEXT:    store i64 [[TMP24]], ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
 ; CHECK-NEXT:    store i64 [[TMP25]], ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1
 ; CHECK-NEXT:    store i64 [[TMP26]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2
 ; CHECK-NEXT:    store i64 [[TMP27]], ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3
 ; CHECK-NEXT:    store i64 [[TMP28]], ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
@@ -919,12 +919,12 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) {
 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1000,12 +1000,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1097,12 +1097,12 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) {
 ; CHECK-NEXT:    store i32 [[Z]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1173,24 +1173,24 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
-; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
-; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
-; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1364,7 +1364,15 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3)
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 12
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
@@ -1373,21 +1381,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP34]], i64 28
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 36
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
 ; CHECK-NEXT:    store i32 [[X:%.*]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    store i32 [[X]], ptr [[TMP17]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index f801443b85d3f..8554f1fb2c9eb 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1294,8 +1294,8 @@ define i32 @iv_ext_used_outside( ptr %dst) {
 ; VEC-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4
 ; VEC-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1)
 ; VEC-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
-; VEC-NEXT:    [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32
 ; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; VEC-NEXT:    [[TMP9:%.*]] = zext nneg i16 [[TMP3]] to i32
 ; VEC-NEXT:    [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 7a33fd092e293..161b2000e3d9c 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -570,8 +570,8 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
@@ -664,8 +664,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3)
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
index ffeb3b19d11c4..f8ddd344f5587 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -192,8 +192,8 @@ define void @no_gep_or_bitcast(ptr noalias %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 1480bc930a5d2..fda6e4811c0f3 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -821,17 +821,17 @@ define void @multiple_ivs_wide(ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 6
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8)
@@ -857,17 +857,17 @@ define void @multiple_ivs_wide(ptr %dst) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6
 ; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2)
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP24]], ptr [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP26]], ptr [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP27]], ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8)
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index ebddca2294d9c..e84261aaf74dc 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -264,8 +264,8 @@ define void @pr43371() optsize {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    store i16 0, ptr [[TMP5]], align 1
@@ -302,8 +302,8 @@ define void @pr43371() optsize {
 ; PGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
 ; PGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; PGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; PGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
 ; PGSO-NEXT:    store i16 0, ptr [[TMP3]], align 1
 ; PGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
@@ -340,8 +340,8 @@ define void @pr43371() optsize {
 ; NPGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
 ; NPGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; NPGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; NPGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; NPGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; NPGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; NPGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
 ; NPGSO-NEXT:    store i16 0, ptr [[TMP3]], align 1
 ; NPGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
@@ -401,8 +401,8 @@ define void @pr43371_pgso() !prof !14 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    store i16 0, ptr [[TMP5]], align 1
@@ -439,8 +439,8 @@ define void @pr43371_pgso() !prof !14 {
 ; PGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
 ; PGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; PGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; PGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
 ; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
 ; PGSO-NEXT:    store i16 0, ptr [[TMP3]], align 1
 ; PGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
index 4c04d96d46479..566003114e44a 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
@@ -17,11 +17,14 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]]
@@ -31,11 +34,8 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 8
 ; CHECK-NEXT:    store ptr [[TMP5]], ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1
 ; CHECK-NEXT:    store ptr [[TMP12]], ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
 ; CHECK-NEXT:    store ptr [[TMP13]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
 ; CHECK-NEXT:    store ptr [[TMP14]], ptr [[TMP10]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 32
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
index fde93d760696c..98e26dbf1eb94 100644
--- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -42,12 +42,12 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP8]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 7579fbcb70ef1..db8d96cee011f 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -230,12 +230,12 @@ for.end:
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index a5cf45c082adb..1ef2667843127 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -30,16 +30,16 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP11]], align 8
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP14]], align 8
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP15]], align 8
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP16]], align 8
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index d5a206ff21da0..136b1419d91b8 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -192,8 +192,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
index 63ca45495335f..abdd5e9562590 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
@@ -76,10 +76,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @foo(double [[TMP3]], i64 [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[TMP8]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index fe533672f2ca2..533b3496f3b84 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -140,12 +140,12 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i64> undef, <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP2]], align 2
 ; CHECK-NEXT:    store i16 0, ptr [[TMP4]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 3aad626554cee..a694a22de110a 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -102,10 +102,10 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -154,20 +154,20 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -211,12 +211,12 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    store i64 [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP6]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -257,20 +257,20 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -312,20 +312,20 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -367,20 +367,20 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -424,12 +424,12 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    store i64 [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP6]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -511,10 +511,10 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -561,10 +561,10 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -614,20 +614,20 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
@@ -672,12 +672,12 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    store i64 [[TMP10]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    store i64 [[TMP8]], ptr [[TMP10]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
@@ -719,20 +719,20 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
@@ -775,20 +775,20 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -831,20 +831,20 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -889,12 +889,12 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    store i64 [[TMP10]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    store i64 [[TMP8]], ptr [[TMP10]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
@@ -937,10 +937,10 @@ define void @test_step_is_not_invariant(ptr %A) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv <2 x i16> [[TMP4]], splat (i16 6)
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    store i16 [[TMP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store i16 [[TMP2]], ptr [[TMP10]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index 1f331a4bf973d..a24842e87f5b6 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -102,10 +102,10 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -154,20 +154,20 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -254,20 +254,20 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -309,20 +309,20 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -362,10 +362,10 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -415,20 +415,20 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
@@ -471,20 +471,20 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -527,20 +527,20 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; CHECK-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index ea8831c8ab7e5..9a72cd6faed2d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -18,28 +18,28 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP1:%.*]] = urem <8 x i64> [[TMP0]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1
@@ -94,28 +94,28 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv <8 x i64> [[TMP0]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem <8 x i64> [[TMP1]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP14]], align 8
-; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP18]], align 8
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> poison, i64 [[TMP19]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 1
@@ -168,28 +168,28 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x i64> poison, i64 [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index 1f33f7a15edd4..843fd09c3fd96 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -107,16 +107,16 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0
 ; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1
@@ -235,20 +235,20 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -273,34 +273,34 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer
 ; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 ; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
 ; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3
 ; VF4-NEXT:    [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42)
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
-; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP22]], align 8
-; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
-; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP23]], align 8
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP22]], ptr [[TMP26]], align 8
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP27]], align 8
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
@@ -344,12 +344,12 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
 ; VF2-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    store i64 [[TMP8]], ptr [[TMP6]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP6]], ptr [[TMP8]], align 8
+; VF2-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF2-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -375,18 +375,18 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
 ; VF4-NEXT:    [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    store i64 [[TMP12]], ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    store i64 [[TMP13]], ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    store i64 [[TMP14]], ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    store i64 [[TMP15]], ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP8]], ptr [[TMP12]], align 8
+; VF4-NEXT:    store i64 [[TMP9]], ptr [[TMP13]], align 8
+; VF4-NEXT:    store i64 [[TMP10]], ptr [[TMP14]], align 8
+; VF4-NEXT:    store i64 [[TMP11]], ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF4-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -427,20 +427,20 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -465,34 +465,34 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer
 ; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 ; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
 ; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3
 ; VF4-NEXT:    [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42)
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
-; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP22]], align 8
-; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
-; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP23]], align 8
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP22]], ptr [[TMP26]], align 8
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP27]], align 8
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -534,20 +534,20 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
 ; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
-; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP12]], align 8
-; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -572,34 +572,34 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
 ; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
 ; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3
 ; VF4-NEXT:    [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42)
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
-; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP22]], align 8
-; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
-; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP23]], align 8
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
+; VF4-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP22]], ptr [[TMP26]], align 8
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP27]], align 8
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -640,10 +640,10 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; VF2-NEXT:    [[TMP0:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
 ; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; VF2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
@@ -671,16 +671,16 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; VF4-NEXT:    [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0
 ; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1
@@ -734,12 +734,12 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
 ; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; VF2-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; VF2-NEXT:    store i64 [[TMP10]], ptr [[TMP8]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP7]], ptr [[TMP9]], align 8
+; VF2-NEXT:    store i64 [[TMP8]], ptr [[TMP10]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
 ; VF2-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -766,18 +766,18 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
 ; VF4-NEXT:    [[TMP8:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
-; VF4-NEXT:    store i64 [[TMP13]], ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
-; VF4-NEXT:    store i64 [[TMP14]], ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
-; VF4-NEXT:    store i64 [[TMP15]], ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
-; VF4-NEXT:    store i64 [[TMP16]], ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
+; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP9]], ptr [[TMP13]], align 8
+; VF4-NEXT:    store i64 [[TMP10]], ptr [[TMP14]], align 8
+; VF4-NEXT:    store i64 [[TMP11]], ptr [[TMP15]], align 8
+; VF4-NEXT:    store i64 [[TMP12]], ptr [[TMP16]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
 ; VF4-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -819,20 +819,20 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -858,34 +858,34 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
 ; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3
 ; VF4-NEXT:    [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42)
-; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0
-; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP23]], align 8
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP27]], align 8
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -928,20 +928,20 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 2)
 ; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
 ; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP13]], align 8
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
+; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP13]], ptr [[TMP15]], align 8
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
@@ -967,34 +967,34 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 2)
 ; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
 ; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3
 ; VF4-NEXT:    [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42)
-; VF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0
-; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP23]], align 8
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
+; VF4-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP23]], ptr [[TMP27]], align 8
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index ef6255720d73c..de77020d9034c 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -18,10 +18,10 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
@@ -52,16 +52,16 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -144,16 +144,16 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -209,10 +209,10 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
@@ -243,16 +243,16 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -311,20 +311,20 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -353,34 +353,34 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -430,20 +430,20 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -472,34 +472,34 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -549,20 +549,20 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -591,34 +591,34 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -668,20 +668,20 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -710,34 +710,34 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -787,20 +787,20 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -829,34 +829,34 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -906,20 +906,20 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
 ; VF2-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42)
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP14]], align 8
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
+; VF2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    store i64 [[TMP14]], ptr [[TMP16]], align 8
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -948,34 +948,34 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
-; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
-; VF4-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
+; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
 ; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3
 ; VF4-NEXT:    [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42)
-; VF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
-; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP24]], align 8
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
+; VF4-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3
+; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    store i64 [[TMP24]], ptr [[TMP28]], align 8
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1023,10 +1023,10 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
@@ -1058,16 +1058,16 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -1124,10 +1124,10 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
@@ -1159,16 +1159,16 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -1225,10 +1225,10 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
@@ -1260,16 +1260,16 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
-; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
-; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
-; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
 ; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
 ; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
@@ -1329,20 +1329,20 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1372,34 +1372,34 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1450,20 +1450,20 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1493,34 +1493,34 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1571,20 +1571,20 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1614,34 +1614,34 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1692,20 +1692,20 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1735,34 +1735,34 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1813,20 +1813,20 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1856,34 +1856,34 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
@@ -1934,20 +1934,20 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
 ; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
 ; VF2-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1
 ; VF2-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42)
-; VF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
-; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
-; VF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
-; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
+; VF2-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; VF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; VF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 8
+; VF2-NEXT:    store i64 [[TMP16]], ptr [[TMP18]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
 ; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
@@ -1977,34 +1977,34 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
-; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; VF4-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
+; VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; VF4-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; VF4-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8
+; VF4-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
 ; VF4-NEXT:    [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0
 ; VF4-NEXT:    [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1
 ; VF4-NEXT:    [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2
 ; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3
 ; VF4-NEXT:    [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42)
-; VF4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
-; VF4-NEXT:    store i64 [[TMP29]], ptr [[TMP25]], align 8
-; VF4-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
-; VF4-NEXT:    store i64 [[TMP30]], ptr [[TMP26]], align 8
-; VF4-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
-; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
-; VF4-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
-; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
+; VF4-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1
+; VF4-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3
+; VF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT:    store i64 [[TMP25]], ptr [[TMP29]], align 8
+; VF4-NEXT:    store i64 [[TMP26]], ptr [[TMP30]], align 8
+; VF4-NEXT:    store i64 [[TMP27]], ptr [[TMP31]], align 8
+; VF4-NEXT:    store i64 [[TMP28]], ptr [[TMP32]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
 ; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 8d854dd1164c3..5c3ef63c79833 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -340,12 +340,12 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
index 4a0a5e3be5b19..fcdb04b494dce 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll
@@ -63,29 +63,29 @@ define void @expand(ptr %src, ptr %dst, i64 %0) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]]
-; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP22]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP24]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP31]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP26]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP33]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP28]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = or disjoint <4 x i64> [[TMP20]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i64> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i64> [[TMP29]], i32 2
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i64> [[TMP29]], i32 3
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP36]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP31]], align 8, !alias.scope [[META3]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP33]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP41]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP35]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP42]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP37]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 732214aa1449e..ca14d1977c411 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -462,8 +462,9 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT:     vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[EXP_SCEV]]>
 ; CHECK-NEXT:     WIDEN ir<%v3> = add nuw ir<%iv>, ir<1>
+; CHECK-NEXT:     EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%v3>
 ; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT:     REPLICATE store ir<%v3>, ir<%gep>
+; CHECK-NEXT:     REPLICATE store vp<[[SCALARS]]>, ir<%gep>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count  vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT:   No successors
@@ -905,6 +906,7 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
 ; CHECK-NEXT:    WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:    WIDEN-CAST ir<%zext> = zext nneg ir<%l>
+; CHECK-NEXT:    EMIT vp<[[SCALARS:%.+]]> = unpack-into-scalars ir<%zext>
 ; CHECK-NEXT:    REPLICATE store ir<%zext>, ir<%p1>
 ; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT:    EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
index 59debc5f7904b..e88710b0f4889 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
@@ -78,7 +78,8 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly
 ; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]>
 ; CHECK-NEXT:      vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
 ; CHECK-NEXT:      WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]>
-; CHECK-NEXT:      REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-NEXT:      EMIT vp<[[UNPACK:%.+]]> = unpack-into-scalars ir<%in_val>
+; CHECK-NEXT:      REPLICATE ir<%call> = call @foo(vp<[[UNPACK]]>)
 ; CHECK-NEXT:      EMIT vp<[[PACK:%.+]]> = pack-into-vector ir<%call>
 ; CHECK-NEXT:      WIDEN ir<%extract_a> = extractvalue vp<[[PACK]]>, ir<0>
 ; CHECK-NEXT:      WIDEN ir<%extract_b> = extractvalue vp<[[PACK]]>, ir<1>

>From 7f53c1cea717f7479330ff7f10f7648f585b8555 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 21 Jun 2025 22:16:42 +0100
Subject: [PATCH 3/4] [VPlan] Go through vector for predicated
 VPReplicateRecipes.

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e48ac81f8857c..324bfb473b262 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1926,14 +1926,21 @@ static void materializePack(VPlan &Plan) {
                           !vputils::isSingleScalar(RepR->getOperand(1)));
         });
       }
-      if (all_of(Def->users(),
-                 [Def](VPUser *U) { return U->usesScalars(Def); }))
+      if (all_of(Def->users(), [Def](VPUser *U) {
+            auto *R = cast<VPRecipeBase>(U);
+            return U->usesScalars(Def) &&
+                   (!R->getParent()->getParent() ||
+                    !R->getParent()->getParent()->getParent());
+          }))
         continue;
 
       auto *Pack = new VPInstruction(VPInstruction::Pack, {Def});
       Pack->insertAfter(Def);
       Def->replaceUsesWithIf(Pack, [Pack, Def](VPUser &U, unsigned) {
-        return &U != Pack && !U.usesScalars(Def) &&
+        auto *R = cast<VPRecipeBase>(&U);
+        return &U != Pack &&
+               (!U.usesScalars(Def) ||
+                (R->getParent()->getParent() && R->getParent()->getParent())) &&
                (!isa<VPInstruction>(&U) ||
                 (cast<VPInstruction>(&U)->getOpcode() !=
                      VPInstruction::ExtractLastElement &&

>From 166ec78f87a051ec99ff91bb12d7c3d3900eac52 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 21 Jun 2025 21:48:16 +0100
Subject: [PATCH 4/4] [VPlan] Unroll by VF

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   7 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |   7 +
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   2 +
 .../Transforms/Vectorize/VPlanPatternMatch.h  |  14 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  91 ++++++++-----
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  18 +++
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 125 ++++++++++++++++++
 ...terleave-ptradd-with-replicated-operand.ll |  33 ++---
 ...6-sunk-instruction-used-outside-of-loop.ll |   2 +
 .../LoopVectorize/first-order-recurrence.ll   |  16 +--
 .../invariant-store-vectorization-2.ll        |   6 +-
 .../LoopVectorize/iv_outside_user.ll          |   3 -
 .../LoopVectorize/load-deref-pred-align.ll    |   2 +
 .../Transforms/LoopVectorize/struct-return.ll |   8 +-
 .../Transforms/LoopVectorize/uniform-blend.ll |   4 +
 17 files changed, 261 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f4259d3d69880..d5c60786d7f9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7328,6 +7328,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
+  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c0675c7003298..2521b2b93e6c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -261,6 +261,13 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
     return Data.VPV2Scalars[Def][0];
   }
 
+  // Look through BuildVector to avoid redundant extracts.
+  // TODO: Remove once replicate regions are unrolled explicitly.
+  if (Lane.getKind() == VPLane::Kind::First && match(Def, m_BuildVector())) {
+    auto *BuildVector = cast<VPInstruction>(Def);
+    return get(BuildVector->getOperand(Lane.getKnownLane()), true);
+  }
+
   assert(hasVectorValue(Def));
   auto *VecPart = Data.VPV2Vector[Def];
   if (!VecPart->getType()->isVectorTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e0201ff334ce1..9ffdd481a784c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -936,6 +936,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
     BranchOnCount,
     BranchOnCond,
     Broadcast,
+    /// Given operands of (the same) struct type, creates a struct of fixed-
+    /// width vectors each containing a struct field of all operands. The
+    /// number of operands matches the element count of every vector.
+    BuildStructVector,
+    /// Creates a fixed-width vector containing all operands. The number of
+    /// operands matches the vector element count.
+    BuildVector,
     ComputeAnyOfResult,
     ComputeFindLastIVResult,
     ComputeReductionResult,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c1c2ca7d8e639..9fda3406604b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
     return SetResultTyFromOp();
   case VPInstruction::FirstActiveLane:
     return Type::getIntNTy(Ctx, 64);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ae91e310f7592..8178f9b87a88e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -224,6 +224,9 @@ struct Recipe_match {
     if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...))
       return false;
 
+    auto *VPI = dyn_cast<VPInstruction>(R);
+    if (VPI && VPI->getOpcode() == VPInstruction::BuildVector)
+      return true;
     assert(R->getNumOperands() == std::tuple_size<Ops_t>::value &&
            "recipe with matched opcode does not have the expected number of "
            "operands");
@@ -263,6 +266,10 @@ struct Recipe_match {
   }
 };
 
+template <unsigned Opcode, typename... RecipeTys>
+using ZeroOpRecipe_match =
+    Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>;
+
 template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
 using UnaryRecipe_match =
     Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>;
@@ -271,6 +278,9 @@ template <typename Op0_t, unsigned Opcode>
 using UnaryVPInstruction_match =
     UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
 
+template <unsigned Opcode>
+using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>;
+
 template <typename Op0_t, unsigned Opcode>
 using AllUnaryRecipe_match =
     UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
@@ -302,6 +312,10 @@ using AllBinaryRecipe_match =
     BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
                        VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
 
+inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
+  return ZeroOpVPInstruction_match<VPInstruction::BuildVector>();
+}
+
 template <unsigned Opcode, typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Opcode>
 m_VPInstruction(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1e8cf8c912901..c0b284208e21a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -504,9 +504,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case Instruction::ExtractElement: {
     assert(State.VF.isVector() && "Only extract elements from vectors");
-    Value *Vec = State.get(getOperand(0));
-    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
-    return Builder.CreateExtractElement(Vec, Idx, Name);
+    unsigned IdxToExtract =
+        cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
+    return State.get(getOperand(0), VPLane(IdxToExtract));
   }
   case Instruction::Freeze: {
     Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
@@ -617,6 +617,32 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateVectorSplat(
         State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
   }
+  case VPInstruction::BuildStructVector: {
+    // For struct types, we need to build a new 'wide' struct type, where each
+    // element is widened, i.e. we crate a struct of vectors .
+    auto *StructTy =
+        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
+    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
+    for (const auto &[Idx, Op] : enumerate(operands())) {
+      for (unsigned I = 0; I != StructTy->getNumElements(); I++) {
+        Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
+        Value *VectorValue = Builder.CreateExtractValue(Res, I);
+        VectorValue =
+            Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
+        Res = Builder.CreateInsertValue(Res, VectorValue, I);
+      }
+    }
+    return Res;
+  }
+  case VPInstruction::BuildVector: {
+    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+    auto NumOfElements = ElementCount::getFixed(getNumOperands());
+    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
+    for (const auto &[Idx, Op] : enumerate(operands()))
+      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
+                                              State.Builder.getInt32(Idx));
+    return Res;
+  }
   case VPInstruction::ReductionStartVector: {
     if (State.VF.isScalar())
       return State.get(getOperand(0), true);
@@ -935,6 +961,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
@@ -1059,6 +1087,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Broadcast:
     O << "broadcast";
     break;
+  case VPInstruction::BuildStructVector:
+    O << "buildstructvector";
+    break;
+  case VPInstruction::BuildVector:
+    O << "buildvector";
+    break;
   case VPInstruction::ExtractLastElement:
     O << "extract-last-element";
     break;
@@ -2660,45 +2694,30 @@ static void scalarizeInstruction(const Instruction *Instr,
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
   Instruction *UI = getUnderlyingInstr();
-  if (State.Lane) { // Generate a single instance.
-    assert((State.VF.isScalar() || !isSingleScalar()) &&
-           "uniform recipe shouldn't be predicated");
-    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-    scalarizeInstruction(UI, this, *State.Lane, State);
-    // Insert scalar instance packing it into a vector.
-    if (State.VF.isVector() && shouldPack()) {
-      Value *WideValue;
-      // If we're constructing lane 0, initialize to start from poison.
-      if (State.Lane->isFirstLane()) {
-        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
-        WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF));
-      } else {
-        WideValue = State.get(this);
-      }
-      State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
-                                                          *State.Lane));
-    }
-    return;
-  }
 
-  if (IsSingleScalar) {
-    // Uniform within VL means we need to generate lane 0.
+  if (!State.Lane) {
+    assert(IsSingleScalar &&
+           "VPReplicateRecipes outside replicate regions must be unrolled");
     scalarizeInstruction(UI, this, VPLane(0), State);
     return;
   }
 
-  // A store of a loop varying value to a uniform address only needs the last
-  // copy of the store.
-  if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
-    auto Lane = VPLane::getLastLaneForVF(State.VF);
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
-    return;
+  assert((State.VF.isScalar() || !isSingleScalar()) &&
+         "uniform recipe shouldn't be predicated");
+  scalarizeInstruction(UI, this, *State.Lane, State);
+  // Insert scalar instance packing it into a vector.
+  if (State.VF.isVector() && shouldPack()) {
+    Value *WideValue;
+    // If we're constructing lane 0, initialize to start from poison.
+    if (State.Lane->isFirstLane()) {
+      assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+      WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF));
+    } else {
+      WideValue = State.get(this);
+    }
+    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
+                                                        *State.Lane));
   }
-
-  // Generate scalar instances for all VF lanes.
-  const unsigned EndLane = State.VF.getFixedValue();
-  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
 
 bool VPReplicateRecipe::shouldPack() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 324bfb473b262..29bc53d2e37ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1140,6 +1140,24 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // Look through ExtractLastElement (BuildVector ....).
+  if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
+                    m_BuildVector()))) {
+    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+    Def->replaceAllUsesWith(
+        BuildVector->getOperand(BuildVector->getNumOperands() - 1));
+    return;
+  }
+
+  // Look through ExtractPenultimateElement (BuildVector ....).
+  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+                    m_BuildVector()))) {
+    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+    Def->replaceAllUsesWith(
+        BuildVector->getOperand(BuildVector->getNumOperands() - 2));
+    return;
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 7e51c05d1b5b5..40885cd52a127 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,6 +99,12 @@ struct VPlanTransforms {
   /// Explicitly unroll \p Plan by \p UF.
   static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
 
+  /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan
+  /// with \p VF single-scalar recipes.
+  /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
+  /// dissolving the latter.
+  static void replicateByVF(VPlan &Plan, ElementCount VF);
+
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 0bc683e557e70..cea56765924c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -15,6 +15,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
+#include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -450,3 +451,127 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
 
   VPlanTransforms::removeDeadRecipes(Plan);
 }
+
+struct VPReplicateUnroller {
+  VPlan &Plan;
+  Type *IdxTy;
+  DenseMap<VPValue *, SmallVector<VPValue *>> Rep2LaneDefs;
+
+  VPReplicateUnroller(VPlan &Plan, Type *IdxTy) : Plan(Plan), IdxTy(IdxTy) {}
+
+  void addLaneDef(VPValue *Def, VPValue *LaneDef) {
+    const auto &[LaneDefs, _] = Rep2LaneDefs.insert({Def, {}});
+    LaneDefs->second.push_back(LaneDef);
+  }
+
+  VPValue *getLane(VPValue *Op, const VPLane &Lane, VPBuilder &Builder) {
+    const auto &LaneDefs = Rep2LaneDefs.lookup(Op);
+    if (LaneDefs.empty()) {
+      if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+        return Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
+      }
+      VPValue *Idx =
+          Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+      return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+    }
+    assert(Lane.getKind() != VPLane::Kind::ScalableLast);
+    return LaneDefs[Lane.getKnownLane()];
+  }
+
+  VPValue *getLane0(VPReplicateRecipe *RepR) {
+    VPBuilder B;
+    return getLane(RepR, VPLane::getFirstLane(), B);
+  }
+};
+
+/// Create a single-scalar clone of \p RepR for lane \p Lane.
+static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
+                                       Type *IdxTy, VPReplicateRecipe *RepR,
+                                       VPLane Lane,
+                                       VPReplicateUnroller &State) {
+  // Collect the operands at Lane, creating extracts as needed.
+  SmallVector<VPValue *> NewOps;
+  for (VPValue *Op : RepR->operands()) {
+    if (vputils::isSingleScalar(Op)) {
+      NewOps.push_back(Op);
+      continue;
+    }
+    NewOps.push_back(State.getLane(Op, Lane, Builder));
+  }
+
+  auto *New =
+      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+  New->insertBefore(RepR);
+  return New;
+}
+
+void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
+  Type *IdxTy = IntegerType::get(
+      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
+  VPReplicateUnroller State(Plan, IdxTy);
+  SmallVector<VPRecipeBase *> ToRemove;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *Pack = dyn_cast<VPInstruction>(&R);
+      if (Pack && Pack->getOpcode() == VPInstruction::Pack) {
+        auto LaneDefs = State.Rep2LaneDefs.lookup(Pack->getOperand(0));
+        if (!LaneDefs.empty()) {
+          auto *RepR = cast<VPReplicateRecipe>(Pack->getOperand(0));
+          // If needed, create a Build(Struct)Vector recipe to insert the scalar
+          // lane values into a vector.
+          Type *ResTy = RepR->getUnderlyingInstr()->getType();
+          VPBuilder Builder(Pack);
+          VPValue *VecRes = Builder.createNaryOp(
+              ResTy->isStructTy() ? VPInstruction::BuildStructVector
+                                  : VPInstruction::BuildVector,
+              LaneDefs);
+          Pack->replaceAllUsesWith(VecRes);
+          Pack->eraseFromParent();
+        } else {
+          assert(!isa<VPReplicateRecipe>(Pack->getOperand(0)));
+        }
+        continue;
+      }
+      if (Pack && Pack->getOpcode() == VPInstruction::Unpack) {
+        VPBuilder Builder(Pack);
+
+        auto *Def = Pack->getOperand(0);
+        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) {
+          VPValue *Idx = Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, I));
+          State.addLaneDef(Pack, Builder.createNaryOp(
+                                     Instruction::ExtractElement, {Def, Idx}));
+        }
+        continue;
+      }
+
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR || RepR->isSingleScalar())
+        continue;
+
+      VPBuilder Builder(RepR);
+      ToRemove.push_back(RepR);
+      // Stores to invariant addresses need to store the last lane only.
+      if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
+                     State);
+        continue;
+      }
+
+      /// Create single-scalar version of RepR for all lanes.
+      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+        State.addLaneDef(
+            RepR, cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), State));
+
+      /// Users that only demand the first lane can use the definition for lane
+      /// 0.
+      RepR->replaceUsesWithIf(
+          State.getLane0(RepR),
+          [RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); });
+    }
+  }
+  for (auto *R : reverse(ToRemove))
+    R->eraseFromParent();
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
index 4b6fd1338c32c..a46cd6987ec1c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
@@ -32,42 +32,31 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
+; CHECK-NEXT:    [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]]
-; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
+; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]]
-; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]]
-; CHECK-NEXT:    [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
-; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
-; CHECK-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
+; CHECK-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
+; CHECK-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
+; CHECK-NEXT:    [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
+; CHECK-NEXT:    [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i64 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP30]], i32 -4
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
index 644f10b617eb7..db9be20f7d190 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
@@ -10,6 +10,8 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index fe4f3527dfca8..910cfabda4b76 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -1055,16 +1055,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -1152,16 +1146,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
-; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
+; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
-; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index 0306567c5ce98..a02fddc4cf72d 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -134,18 +134,14 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]]
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope [[META12]]
+; CHECK-NEXT:    store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 8554f1fb2c9eb..5f9f87ffeecdc 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1101,7 +1101,6 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
 ; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
 ; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
 ; VEC-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2
-; VEC-NEXT:    [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]]
 ; VEC-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
@@ -1389,9 +1388,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
 ; VEC-NEXT:    [[TMP1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 0
 ; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1
 ; VEC-NEXT:    store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2
-; VEC-NEXT:    [[TMP3:%.*]] = add i64 2, -1
 ; VEC-NEXT:    [[TMP5:%.*]] = add i64 1, -1
-; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 1
 ; VEC-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 1
 ; VEC-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VEC:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 161b2000e3d9c..f8d0c2737d7bc 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -669,6 +669,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP12]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 50b9ba12af82d..0c5cf09430bb9 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -79,20 +79,20 @@ define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly
 ; CHECK:         [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
 ;                // Lane 0
 ; CHECK:         [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
-; CHECK:         [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK:         [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i64 0
 ; CHECK:         [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0
 ; CHECK:         [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
 ; CHECK:         [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1
-; CHECK:         [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0
+; CHECK:         [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i64 0
 ; CHECK:         [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1
 ;                // Lane 1
 ; CHECK:         [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
 ; CHECK:         [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0
-; CHECK:         [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1
+; CHECK:         [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i64 1
 ; CHECK:         [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0
 ; CHECK:         [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
 ; CHECK:         [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1
-; CHECK:         [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1
+; CHECK:         [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i64 1
 ; CHECK:         [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1
 ;                // Store wide values:
 ; CHECK:         [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 533b3496f3b84..fadbaaa228608 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -229,6 +229,10 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x ptr> [[TMP36]], ptr [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP37]], ptr [[TMP8]], i32 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:



More information about the llvm-commits mailing list